Lots of information about this software and different vignettes can be found online. For example – a recent vignette from the developers of the software; http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#independent-filtering-of-results
Set current working directory where everything will be saved and stored
setwd("~/Documents/EPO Project/CRISPR/Whole gene knock-out/Confirming EPO KO/RNA Sequencing/DeSeq2/")
Load in packages which will be needed
library(magrittr)
read.counts <- read.table("~/Documents/EPO Project/CRISPR/Whole gene knock-out/Confirming EPO KO/RNA Sequencing/FeatureCounts/feature_count_results_geneid.txt", sep="\t", header = TRUE)
The results can now handled as a dataframe in the R environment
head(read.counts, n=3)
## Geneid Chr
## 1 ENSG00000223972 1;1;1;1;1;1;1;1;1
## 2 ENSG00000227232 1;1;1;1;1;1;1;1;1;1;1
## 3 ENSG00000278267 1
## Start
## 1 11869;12010;12179;12613;12613;12975;13221;13221;13453
## 2 14404;15005;15796;16607;16858;17233;17606;17915;18268;24738;29534
## 3 17369
## End
## 1 12227;12057;12227;12721;12697;13052;13374;14409;13670
## 2 14501;15038;15947;16765;17055;17368;17742;18061;18366;24891;29570
## 3 17436
## Strand Length
## 1 +;+;+;+;+;+;+;+;+ 1735
## 2 -;-;-;-;-;-;-;-;-;-;- 1351
## 3 - 68
## X.gpfs.mrc0.projects.Research_Project.MRC158833.cs660.EPO_project.RNA_sequencing_analysis.Star_Alignment_default.test_2.FeatureCounts.Empty1_alignment_default_SortedByReadName.out.bam
## 1 0
## 2 11
## 3 1
## X.gpfs.mrc0.projects.Research_Project.MRC158833.cs660.EPO_project.RNA_sequencing_analysis.Star_Alignment_default.test_2.FeatureCounts.Empty2_alignment_default_SortedByReadName.out.bam
## 1 0
## 2 11
## 3 0
## X.gpfs.mrc0.projects.Research_Project.MRC158833.cs660.EPO_project.RNA_sequencing_analysis.Star_Alignment_default.test_2.FeatureCounts.Empty3_alignment_default_SortedByReadName.out.bam
## 1 0
## 2 11
## 3 1
## X.gpfs.mrc0.projects.Research_Project.MRC158833.cs660.EPO_project.RNA_sequencing_analysis.Star_Alignment_default.test_2.FeatureCounts.Empty4_alignment_default_SortedByReadName.out.bam
## 1 0
## 2 10
## 3 0
## X.gpfs.mrc0.projects.Research_Project.MRC158833.cs660.EPO_project.RNA_sequencing_analysis.Star_Alignment_default.test_2.FeatureCounts.KOA1_alignment_default_SortedByReadName.out.bam
## 1 0
## 2 4
## 3 1
## X.gpfs.mrc0.projects.Research_Project.MRC158833.cs660.EPO_project.RNA_sequencing_analysis.Star_Alignment_default.test_2.FeatureCounts.KOA2_alignment_default_SortedByReadName.out.bam
## 1 0
## 2 7
## 3 1
## X.gpfs.mrc0.projects.Research_Project.MRC158833.cs660.EPO_project.RNA_sequencing_analysis.Star_Alignment_default.test_2.FeatureCounts.KOA3_alignment_default_SortedByReadName.out.bam
## 1 0
## 2 5
## 3 0
## X.gpfs.mrc0.projects.Research_Project.MRC158833.cs660.EPO_project.RNA_sequencing_analysis.Star_Alignment_default.test_2.FeatureCounts.KOA5_alignment_default_SortedByReadName.out.bam
## 1 0
## 2 4
## 3 0
## X.gpfs.mrc0.projects.Research_Project.MRC158833.cs660.EPO_project.RNA_sequencing_analysis.Star_Alignment_default.test_2.FeatureCounts.KOB1_alignment_default_SortedByReadName.out.bam
## 1 0
## 2 5
## 3 2
## X.gpfs.mrc0.projects.Research_Project.MRC158833.cs660.EPO_project.RNA_sequencing_analysis.Star_Alignment_default.test_2.FeatureCounts.KOB211_alignment_default_SortedByReadName.out.bam
## 1 0
## 2 5
## 3 2
## X.gpfs.mrc0.projects.Research_Project.MRC158833.cs660.EPO_project.RNA_sequencing_analysis.Star_Alignment_default.test_2.FeatureCounts.KOB3_alignment_default_SortedByReadName.out.bam
## 1 0
## 2 8
## 3 0
## X.gpfs.mrc0.projects.Research_Project.MRC158833.cs660.EPO_project.RNA_sequencing_analysis.Star_Alignment_default.test_2.FeatureCounts.KOB5_alignment_default_SortedByReadName.out.bam
## 1 0
## 2 10
## 3 0
Replace all row names with the names of genes
row.names(read.counts) <- read.counts$Geneid
Remove the irrelevant columns which contain no count data
read.counts <- read.counts[,-c(1:6)]
Give meaningful sample names to the columns if your data-frame does not already have clear names - this can be achieved via numerous approaches
names(read.counts) <- c("WT1", "WT2","WT3","WT4","KOA1", "KOA2", "KOA3", "KOA4", "KOB1", "KOB2", "KOB3", "KOB4")
# Check data is what we expect
str(read.counts)
## 'data.frame': 60623 obs. of 12 variables:
## $ WT1 : int 0 11 1 1 0 0 0 0 0 1 ...
## $ WT2 : int 0 11 0 1 0 0 0 0 0 0 ...
## $ WT3 : int 0 11 1 0 0 0 0 0 0 0 ...
## $ WT4 : int 0 10 0 0 0 0 0 0 0 1 ...
## $ KOA1: int 0 4 1 1 0 0 0 0 0 0 ...
## $ KOA2: int 0 7 1 0 0 0 0 0 0 0 ...
## $ KOA3: int 0 5 0 0 0 0 0 0 0 0 ...
## $ KOA4: int 0 4 0 0 0 0 0 0 0 0 ...
## $ KOB1: int 0 5 2 1 0 0 0 0 0 0 ...
## $ KOB2: int 0 5 2 0 0 0 0 0 0 1 ...
## $ KOB3: int 0 8 0 1 0 0 0 0 0 0 ...
## $ KOB4: int 0 10 0 0 0 0 0 0 0 0 ...
head (read.counts, n = 3)
## WT1 WT2 WT3 WT4 KOA1 KOA2 KOA3 KOA4 KOB1 KOB2 KOB3 KOB4
## ENSG00000223972 0 0 0 0 0 0 0 0 0 0 0 0
## ENSG00000227232 11 11 11 10 4 7 5 4 5 5 8 10
## ENSG00000278267 1 0 1 0 1 1 0 0 2 2 0 0
Extract just read count data for Control & KOA or Control & KOB for individual analysis
read.counts.KOA <- read.counts[,-c(9:12)]
str(read.counts.KOA)
## 'data.frame': 60623 obs. of 8 variables:
## $ WT1 : int 0 11 1 1 0 0 0 0 0 1 ...
## $ WT2 : int 0 11 0 1 0 0 0 0 0 0 ...
## $ WT3 : int 0 11 1 0 0 0 0 0 0 0 ...
## $ WT4 : int 0 10 0 0 0 0 0 0 0 1 ...
## $ KOA1: int 0 4 1 1 0 0 0 0 0 0 ...
## $ KOA2: int 0 7 1 0 0 0 0 0 0 0 ...
## $ KOA3: int 0 5 0 0 0 0 0 0 0 0 ...
## $ KOA4: int 0 4 0 0 0 0 0 0 0 0 ...
head(read.counts.KOA, n = 3)
## WT1 WT2 WT3 WT4 KOA1 KOA2 KOA3 KOA4
## ENSG00000223972 0 0 0 0 0 0 0 0
## ENSG00000227232 11 11 11 10 4 7 5 4
## ENSG00000278267 1 0 1 0 1 1 0 0
read.counts.KOB <- read.counts[,-c(5:8)]
str(read.counts.KOB)
## 'data.frame': 60623 obs. of 8 variables:
## $ WT1 : int 0 11 1 1 0 0 0 0 0 1 ...
## $ WT2 : int 0 11 0 1 0 0 0 0 0 0 ...
## $ WT3 : int 0 11 1 0 0 0 0 0 0 0 ...
## $ WT4 : int 0 10 0 0 0 0 0 0 0 1 ...
## $ KOB1: int 0 5 2 1 0 0 0 0 0 0 ...
## $ KOB2: int 0 5 2 0 0 0 0 0 0 1 ...
## $ KOB3: int 0 8 0 1 0 0 0 0 0 0 ...
## $ KOB4: int 0 10 0 0 0 0 0 0 0 0 ...
head (read.counts.KOB, n = 3)
## WT1 WT2 WT3 WT4 KOB1 KOB2 KOB3 KOB4
## ENSG00000223972 0 0 0 0 0 0 0 0
## ENSG00000227232 11 11 11 10 5 5 8 10
## ENSG00000278267 1 0 1 0 2 2 0 0
gtf <- read.table("~/Downloads/Homo_sapiens.GRCh38.98_gene_annotation_table.txt", sep="\t", header=T)
gtf$Geneid <- gtf$gene_id
Create the ColData for DeSeq2 which contains information on the conditions, confounders etc The conditions and sample names should correspond to the column names of read.counts
Create a data-frame with a column called condition which is WT or KO Use the column names of read.counts dataframe i.e WT1,WT2 etc but remove the digits so just called WT or KOA or KOB
sample_info <- data.frame(condition = gsub("[0-9]+", "", names(read.counts)),
row.names = names(read.counts))
Replace WT with Control
sample_info$condition <- ifelse(sample_info$condition=="WT", "Control", ifelse(sample_info$condition=="KOA", "KOA", "KOB"))
Change to a factor column not character
sample_info$condition <- as.factor(sample_info$condition)
Alter the levels so that WT is recognised as ‘level 1’
sample_info$condition %<>% relevel("Control")
Check we have what we want
sample_info$condition
## [1] Control Control Control Control KOA KOA KOA KOA KOB
## [10] KOB KOB KOB
## Levels: Control KOA KOB
Create a column of sample name using the column names of the read.counts dataframe
sample_info$sample <- names(read.counts)
sample_info$sample <- factor(sample_info$sample, levels=c("WT1", "WT2","WT3","WT4","KOA1","KOA2","KOA3","KOA4","KOB1","KOB2","KOB3","KOB4"))
sample_info$sample %<>% relevel("WT1")
Create a column for biological replicate where WT sample 1 = 1, WT 2=2 etc
library(stringr)
regexp <- "[[:digit:]]+"
sample_info$rep <- str_extract(names(read.counts), regexp)
sample_info$rep <- as.factor(sample_info$rep)
Create a column for Genotype where WT or KO i.e combines KOA & KOB to just KO Use this column to combine all KO samples
sample_info$genotype <- ifelse(sample_info$condition=="Control", "WT", ifelse(sample_info$condition=="KOA", "KO", "KO"))
sample_info$genotype <- as.factor(sample_info$genotype) # change column to factor not character
sample_info$genotype %<>% relevel("WT") # relevel to ensure WT is level 1
sample_info$genotype # check it is what we expect
## [1] WT WT WT WT KO KO KO KO KO KO KO KO
## Levels: WT KO
Create a column for cell line where WT is one cell line, KOA is another cell line regardless of replicate and KOB is a cellline regardless of replicate
sample_info$cellline <- ifelse(sample_info$condition=="Control", "0", ifelse(sample_info$condition=="KOA", "1", "2"))
sample_info$cellline <- as.factor(sample_info$cellline) # change to factor
Set up meta-data just for KOA vs Control
sample_info_KOA <- data.frame(condition = gsub("[0-9]+", "", names(read.counts.KOA)),
row.names = names(read.counts.KOA))
sample_info_KOA$condition <- ifelse(sample_info_KOA$condition=="WT", "Control", "KOA")
sample_info_KOA$condition <- as.factor(sample_info_KOA$condition)
# alter the levels so that WT is recogised as 'level 1'
sample_info_KOA$condition %<>% relevel("Control")
sample_info_KOA$condition
## [1] Control Control Control Control KOA KOA KOA KOA
## Levels: Control KOA
# create a column of sample name
sample_info_KOA$sample <- names(read.counts.KOA)
sample_info_KOA$sample <- factor(sample_info_KOA$sample, levels=c("WT1", "WT2","WT3","WT4","KOA1","KOA2","KOA3","KOA4"))
sample_info_KOA$sample %<>% relevel("WT1")
# create a column for biological replicate where WT sample 1 = 1, WT 2=2 etc
library(stringr)
regexp <- "[[:digit:]]+"
sample_info_KOA$rep <- str_extract(names(read.counts.KOA), regexp)
sample_info_KOA$rep <- as.factor(sample_info_KOA$rep)
# create a column for Genotype where name WT or KO
# use this column to combine all KO samples
sample_info_KOA$genotype <- ifelse(sample_info_KOA$condition=="Control", "WT", ifelse(sample_info_KOA$condition=="KOA", "KO", "KO"))
sample_info_KOA$genotype <- as.factor(sample_info_KOA$genotype)
sample_info_KOA$genotype %<>% relevel("WT")
sample_info_KOA$genotype
## [1] WT WT WT WT KO KO KO KO
## Levels: WT KO
# create a column for cell line where WT is one cell line, KOA is another cell line regardless of replicate and KOB is a cellline regardless of replicate
sample_info_KOA$cellline <- ifelse(sample_info_KOA$condition=="Control", "0", ifelse(sample_info_KOA$condition=="KOA", "1", "2"))
sample_info_KOA$cellline <- as.factor(sample_info_KOA$cellline)
Set up meta-data just for KOB vs Control
## KOB meta-data
sample_info_KOB <- data.frame(condition = gsub("[0-9]+", "", names(read.counts.KOB)),
row.names = names(read.counts.KOB))
sample_info_KOB$condition <- ifelse(sample_info_KOB$condition=="WT", "Control", "KOB")
sample_info_KOB$condition <- as.factor(sample_info_KOB$condition)
# alter the levels so that WT is recogised as 'level 1'
sample_info_KOB$condition %<>% relevel("Control")
sample_info_KOB$condition
## [1] Control Control Control Control KOB KOB KOB KOB
## Levels: Control KOB
# create a column of sample name
sample_info_KOB$sample <- names(read.counts.KOB)
sample_info_KOB$sample <- factor(sample_info_KOB$sample, levels=c("WT1", "WT2","WT3","WT4","KOB1","KOB2","KOB3","KOB4"))
sample_info_KOB$sample %<>% relevel("WT1")
# create a column for biological replicate where WT sample 1 = 1, WT 2=2 etc
library(stringr)
regexp <- "[[:digit:]]+"
sample_info_KOB$rep <- str_extract(names(read.counts.KOB), regexp)
sample_info_KOB$rep <- as.factor(sample_info_KOB$rep)
# create a column for Genotype where name WT or KO
# use this column to combine all KO samples
sample_info_KOB$genotype <- ifelse(sample_info_KOB$condition=="Control", "WT", ifelse(sample_info_KOB$condition=="KOB", "KO", "KO"))
sample_info_KOB$genotype <- as.factor(sample_info_KOB$genotype)
sample_info_KOB$genotype %<>% relevel("WT")
sample_info_KOB$genotype
## [1] WT WT WT WT KO KO KO KO
## Levels: WT KO
# create a column for cell line where WT is one cell line, KOA is another cell line regardless of replicate and KOB is a cellline regardless of replicate
sample_info_KOB$cellline <- ifelse(sample_info_KOB$condition=="Control", "0", ifelse(sample_info_KOB$condition=="KOB", "1", "2"))
sample_info_KOB$cellline <- as.factor(sample_info_KOB$cellline)
Assign colours to the different groups in the meta-data data-frame # we can use these colours later to colour groups
library("RColorBrewer")
col.genotype <- colorRampPalette(c("royalblue", "red3"))(length(unique(sample_info$genotype)))[factor(sample_info$genotype)]
col.condition <- colorRampPalette(c("lightblue2", "orange", "deeppink"))(length(unique(sample_info$condition)))[factor(sample_info$condition)]
col.rep <- colorRampPalette(c("snow2", "azure2","lightblue2","steelblue2"))(length(unique(sample_info$rep)))[factor(sample_info$rep)]
col.sample <- colorRampPalette(c("cadetblue1", "cadetblue2","cadetblue3","cadetblue4", "brown1", "brown2","brown3","brown4","deeppink", "deeppink1", "deeppink2", "deeppink3"))(length(unique(rownames(sample_info))))
Can filter here or filter once data has been correctly read into DeSeq2 (commands for this below after reading data into DeSeq2)
Remove transcripts whose mean raw count across all samples falls below 10
#ZeroCountFilterIndices <- which(apply(read.counts, 1, mean)<10)
#print(paste("Total transcripts with mean<10 counts (all samples):", length(ZeroCountFilterIndices), sep=" "))
#if (length(ZeroCountFilterIndices)>0)
#{
# filtered_readcounts <- read.counts[-ZeroCountFilterIndices,]
#}
Check if all genes have at least 1 zero (generates error with DESeq2 1. This first converts the entire data frame to TRUE or FALSE (0 or non-zero) 2. It then applies the table function per row, which gives TRUE and FALSE tallies per gene 3. It then checks if any tally equals the total number of samples - as we’ve already eliminated genes with all 0 values, the only condition that can meet ncol(txi.working$counts) is the FALSE (non-zero) condition This produces a further TRUE or FALSE for each gene and condition 4. Finally, if any TRUE values are present, then we know that at least 1 row has a non-zeros, and therefore we can proceed
#if (!any(data.frame(unlist(apply((filtered_readcounts==0), 1, function(x) table(x))))==ncol(filtered_readcounts)))
#{
# print("All genes contain at least 1 zero.")
# next()
#}
Install DeSeq2
#BiocManager::install()
#BiocManager::install("DESeq2")
Load in DE-Seq2
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: DelayedArray
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
## Loading required package: BiocParallel
## Warning: multiple methods tables found for 'type'
##
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
##
## colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following object is masked from 'package:BiocGenerics':
##
## type
## The following objects are masked from 'package:base':
##
## aperm, apply, rowsum
## Warning: replacing previous import 'BiocGenerics::type' by 'DelayedArray::type'
## when loading 'SummarizedExperiment'
library(DESeq2)
Convert the data to DESeq format and specify the model We want to test for the effect of genotype (combining all KOs as KO) Make sure that control/WT is the first level of a factor i.e the control level
library("magrittr")
# for all KOs combined
dds <- DESeqDataSetFromMatrix(countData=read.counts, #this is the count results data-frame,
colData=sample_info, #this is the data-frame containing information on samples,
design= ~genotype) #specify the design model - here we want to test WT against KO
# for Control vs KOA combined
dds_KOA <- DESeqDataSetFromMatrix(countData=read.counts.KOA, #this is the count results data-frame,
colData=sample_info_KOA, #this is the data-frame containing information on samples,
design= ~genotype) #specify the design model - here we want to test WT against KOA
# for Control vs KOB combined
dds_KOB <- DESeqDataSetFromMatrix(countData=read.counts.KOB, #this is the count results data-frame,
colData=sample_info_KOB, #this is the data-frame containing information on samples,
design= ~genotype) #specify the design model - here we want to test WT against KOB
Check the DeSeq2 dataset has been read in correctly
colData(dds) %>% head
## DataFrame with 6 rows and 5 columns
## condition sample rep genotype cellline
## <factor> <factor> <factor> <factor> <factor>
## WT1 Control WT1 1 WT 0
## WT2 Control WT2 2 WT 0
## WT3 Control WT3 3 WT 0
## WT4 Control WT4 4 WT 0
## KOA1 KOA KOA1 1 KO 1
## KOA2 KOA KOA2 2 KO 1
assay(dds, "counts") %>% head
## WT1 WT2 WT3 WT4 KOA1 KOA2 KOA3 KOA4 KOB1 KOB2 KOB3 KOB4
## ENSG00000223972 0 0 0 0 0 0 0 0 0 0 0 0
## ENSG00000227232 11 11 11 10 4 7 5 4 5 5 8 10
## ENSG00000278267 1 0 1 0 1 1 0 0 2 2 0 0
## ENSG00000243485 1 1 0 0 1 0 0 0 1 0 1 0
## ENSG00000284332 0 0 0 0 0 0 0 0 0 0 0 0
## ENSG00000237613 0 0 0 0 0 0 0 0 0 0 0 0
rowData(dds) %>% head
## DataFrame with 6 rows and 0 columns
# test what counts () returns
counts(dds) %>% str
## int [1:60623, 1:12] 0 11 1 1 0 0 0 0 0 1 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:60623] "ENSG00000223972" "ENSG00000227232" "ENSG00000278267" "ENSG00000243485" ...
## ..$ : chr [1:12] "WT1" "WT2" "WT3" "WT4" ...
Filter (this is very similar to what would have been done above but you can do it the deseq2 dataframe instead)
Remove any genes where the expresison is < 1
nrow(dds)
## [1] 60623
keep <- rowSums(counts(dds)) > 1
dds <- dds[keep,]
nrow(dds)
## [1] 32385
# For Control vs KOA
nrow(dds_KOA)
## [1] 60623
keep <- rowSums(counts(dds_KOA)) > 1
dds_KOA <- dds_KOA[keep,]
nrow(dds_KOA)
## [1] 30207
# For Control vs KOB
nrow(dds_KOB)
## [1] 60623
keep <- rowSums(counts(dds_KOB)) > 1
dds_KOB <- dds_KOB[keep,]
nrow(dds_KOB)
## [1] 30572
Investigate different library sizes
colSums(counts(dds))
## WT1 WT2 WT3 WT4 KOA1 KOA2 KOA3 KOA4
## 11013320 13515042 12062073 13108261 15893497 16272781 11457237 11717676
## KOB1 KOB2 KOB3 KOB4
## 17162105 10317126 13348451 12118735
colSums(read.counts)
## WT1 WT2 WT3 WT4 KOA1 KOA2 KOA3 KOA4
## 11013728 13515479 12062470 13108797 15893921 16273139 11457524 11718005
## KOB1 KOB2 KOB3 KOB4
## 17162544 10317625 13348800 12119009
DESeq2’s default method to normalize read counts to account for differences in sequencing depths is implemented in estimateSizeFactors()
dds <- estimateSizeFactors(dds)
sizeFactors(dds)
## WT1 WT2 WT3 WT4 KOA1 KOA2 KOA3 KOA4
## 0.8522898 1.0480173 0.9210087 0.9972124 1.1446152 1.1913074 0.8907589 0.9322756
## KOB1 KOB2 KOB3 KOB4
## 1.3593630 0.8587181 1.0661527 0.9712663
If you check colData () again , you see that this now contains the sizeFactors
colData(dds)
## DataFrame with 12 rows and 6 columns
## condition sample rep genotype cellline sizeFactor
## <factor> <factor> <factor> <factor> <factor> <numeric>
## WT1 Control WT1 1 WT 0 0.852289801791534
## WT2 Control WT2 2 WT 0 1.04801734850232
## WT3 Control WT3 3 WT 0 0.921008729936671
## WT4 Control WT4 4 WT 0 0.997212359449115
## KOA1 KOA KOA1 1 KO 1 1.14461524344672
## ... ... ... ... ... ... ...
## KOA4 KOA KOA4 4 KO 1 0.932275605735372
## KOB1 KOB KOB1 1 KO 2 1.35936301971261
## KOB2 KOB KOB2 2 KO 2 0.858718111825033
## KOB3 KOB KOB3 3 KO 2 1.06615265598051
## KOB4 KOB KOB4 4 KO 2 0.971266300266936
Counts() allows you to immediately retrieve the _normalized_read counts
norm <- counts(dds, normalized=TRUE)
Control vs KOA DESeq2’s default method to normalize read counts to account for differences in sequencing depths is implemented in estimateSizeFactors()
dds_KOA <- estimateSizeFactors(dds_KOA)
sizeFactors(dds_KOA)
## WT1 WT2 WT3 WT4 KOA1 KOA2 KOA3 KOA4
## 0.8642711 1.0671616 0.9401645 1.0150429 1.1675653 1.2107008 0.9077720 0.9488105
# if you check colData () again , you see that this now contains the sizeFactors
colData(dds_KOA)
## DataFrame with 8 rows and 6 columns
## condition sample rep genotype cellline sizeFactor
## <factor> <factor> <factor> <factor> <factor> <numeric>
## WT1 Control WT1 1 WT 0 0.864271075329799
## WT2 Control WT2 2 WT 0 1.06716159577229
## WT3 Control WT3 3 WT 0 0.940164530728474
## WT4 Control WT4 4 WT 0 1.01504288107225
## KOA1 KOA KOA1 1 KO 1 1.16756529285446
## KOA2 KOA KOA2 2 KO 1 1.21070079387231
## KOA3 KOA KOA3 3 KO 1 0.907771981531634
## KOA4 KOA KOA4 4 KO 1 0.948810489057759
# counts () allows you to immediately retrieve the _normalized_read counts
norm_KOA <- counts(dds_KOA, normalized=TRUE)
Control vs KOB DESeq2’s default method to normalize read counts to account for differences in sequencing depths is implemented in estimateSizeFactors()
dds_KOB <- estimateSizeFactors(dds_KOB)
sizeFactors(dds_KOB)
## WT1 WT2 WT3 WT4 KOB1 KOB2 KOB3 KOB4
## 0.8590223 1.0563523 0.9269907 1.0064413 1.3705430 0.8729200 1.0741463 0.9790657
# if you check colData () again , you see that this now contains the sizeFactors
colData(dds_KOB)
## DataFrame with 8 rows and 6 columns
## condition sample rep genotype cellline sizeFactor
## <factor> <factor> <factor> <factor> <factor> <numeric>
## WT1 Control WT1 1 WT 0 0.85902232819576
## WT2 Control WT2 2 WT 0 1.05635227582385
## WT3 Control WT3 3 WT 0 0.926990686134875
## WT4 Control WT4 4 WT 0 1.0064413010846
## KOB1 KOB KOB1 1 KO 1 1.37054295025715
## KOB2 KOB KOB2 2 KO 1 0.872920027845259
## KOB3 KOB KOB3 3 KO 1 1.07414625588402
## KOB4 KOB KOB4 4 KO 1 0.979065713615552
# counts () allows you to immediately retrieve the _normalized_read counts
norm_KOB <- counts(dds_KOB, normalized=TRUE)
Downstream analyses (including clustering) work much better if the read counts are transformed to the log scale following normalization.
Transform size-factor normalized read counts to log2 scale using a pseudocount of 1
log.norm.counts <- log2(norm + 1)
can also use this command
ntd <- normTransform(dds)
Control vs KOA
# Transform size - factor normalized read counts to log2 scale using a pseudocount of 1
log.norm.counts.KOA <- log2(norm_KOA + 1)
Control vs KOB
# Transform size - factor normalized read counts to log2 scale using a pseudocount of 1
log.norm.counts.KOB <- log2(norm_KOB + 1)
Plotting the transformation
par(mfrow =c(3 , 1)) # to plot the following two images underneath each other
# first, plot the normalised data: non-transformed
boxplot(norm, notch = TRUE ,
main = "Untransformed read counts ", ylab = "read counts")
# second, plot the transformed normalised data
boxplot(log.norm.counts, notch = TRUE ,
main = "Log2 - transformed read counts ",
ylab = " log2 (read counts)")
# this should give exactly the same as log.norm.counts plot
boxplot(assay(ntd), notch = TRUE ,
main = "Log2 - transformed read counts ",
ylab = " log2 (read counts)")
Plot the counts in a pairwise manner
plot(log.norm.counts[ ,1:2] , cex =.1 , main = " Normalized log2 ( read counts )")
Check for heteroscedascity Many statistical tests and analyses assume that data is homoskedastic, i.e. that all variables have similar variance. However, data with large differences among the sizes of the individual observations often shows heteroskedastic behavior. One way to visually check for heteroskedasticity is to plot the mean vs. the standard deviation
# BiocManager::install("vsn")
library("vsn")
library(ggplot2 )
msd_plot <- meanSdPlot(log.norm.counts,
ranks =FALSE , # show the data on the original scale
plot = FALSE )
msd_plot$gg +
ggtitle ("Sequencing depth normalized log2 (read counts )") +
ylab ("Standard Deviation ")
msd_plot <- meanSdPlot(assay(ntd),
ranks =FALSE , # show the data on the original scale
plot = FALSE )
msd_plot$gg +
ggtitle ("Sequencing depth normalized log2 (read counts )") +
ylab ("Standard Deviation ")
meanSdPlot(assay(ntd))
The y-axis shows the variance of the read counts across all samples. Some variability is, in fact, expected, but a clear hump on the left-hand side indicates that for read counts < 32 (2^5 = 32), the variance is higher than for those with greater read counts. That means that there is a dependence of the variance on the mean, which violates the assumption of homoskedasticity.
Shrink the variance of low read counts For RNA-seq counts, however, the expected variance grows with the mean. A simple and often used strategy to avoid this is to take the logarithm of the normalized count values plus a pseudocount of 1; however, depending on the choice of pseudocount, now the genes with the very lowest counts will contribute a great deal of noise to the resulting plot, because taking the logarithm of small counts actually inflates their variance. As a solution, DESeq2 offers two transformations for count data that stabilize the variance across the mean: the variance stabilizing transformation (VST) for negative binomial data with a dispersion-mean trend (Anders and Huber 2010), implemented in the vst function, and the regularized-logarithm transformation or rlog (Love, Huber, and Anders 2014). For genes with high counts, both the VST and the rlog will give similar result to the ordinary log2 transformation of normalized counts. For genes with lower counts, however, the values are shrunken towards a middle value. The VST or rlog-transformed data then become approximately homoskedastic (more flat trend in the meanSdPlot), and can be used directly for computing distances between samples, making PCA plots, or as input to downstream methods which perform best with homoskedastic data.
R log transformation DESeq2’s rlog() function returns values that are both normalized for sequencing depth and transformed to the log2 scale where the values are adjusted to it the experiment-wide trend of the variance-mean relationship blind = FALSE means that differences between cell lines and treatment (the variables in the design) will not contribute to the expected variance-mean trend of the experiment. The experimental design is not used directly in the transformation, only in estimating the global amount of variability in the counts. The rlog() function’s blind parameter should be set to FALSE if the different conditions lead to strong differences in a large proportion of the genes. If rlog() is applied without incorporating the knowledge of the experimental design (blind = TRUE, the default setting), the dispersion will be greatly overestimated in such cases.
rld <- rlogTransformation(dds, blind = TRUE)
rldMatrix <- data.matrix(assay(rld))
head(assay(rld), 3)
## WT1 WT2 WT3 WT4 KOA1
## ENSG00000227232 3.0629219 2.9984253 3.0384135 2.9825290 2.7495646
## ENSG00000278267 -0.6272881 -0.6780645 -0.6302857 -0.6775416 -0.6384064
## ENSG00000243485 -1.2396781 -1.2435720 -1.2788105 -1.2803079 -1.2452328
## KOA2 KOA3 KOA4 KOB1 KOB2
## ENSG00000227232 2.8408587 2.8364707 2.788230 2.7501142 2.8443326
## ENSG00000278267 -0.6398471 -0.6763309 -0.676823 -0.6094799 -0.5811246
## ENSG00000243485 -1.2836577 -1.2781815 -1.279040 -1.2501325 -1.2774915
## KOB3 KOB4
## ENSG00000227232 2.9001463 2.9902961
## ENSG00000278267 -0.6782435 -0.6772616
## ENSG00000243485 -1.2438952 -1.2798113
## Control vs KOA
rld_KOA <- rlogTransformation(dds_KOA, blind = TRUE)
rldMatrixKOA <- data.matrix(assay(rld_KOA))
head(assay(rld_KOA), 3)
## WT1 WT2 WT3 WT4 KOA1 KOA2
## ENSG00000227232 3.061713 3.015667 3.043015 3.005031 2.850328 2.909949
## ENSG00000278267 -1.027827 -1.057046 -1.028949 -1.056623 -1.032966 -1.033726
## ENSG00000243485 -1.357739 -1.360052 -1.380602 -1.381442 -1.361038 -1.383376
## KOA3 KOA4
## ENSG00000227232 2.907350 2.875898
## ENSG00000278267 -1.055135 -1.055724
## ENSG00000243485 -1.380217 -1.380702
## Control vs KOB
rld_KOB <- rlogTransformation(dds_KOB, blind = TRUE)
rldMatrixKOB <- data.matrix(assay(rld_KOB))
head(assay(rld_KOB), 3)
## WT1 WT2 WT3 WT4 KOB1
## ENSG00000227232 3.2354075 3.182845 3.2157180 3.1707010 2.9919523
## ENSG00000278267 -0.4362341 -0.473189 -0.4385083 -0.4728152 -0.4241705
## ENSG00000243485 -1.0323291 -1.035790 -1.0641455 -1.0654057 -1.0420624
## KOB2 KOB3 KOB4
## ENSG00000227232 3.0651051 3.1070436 3.1772826
## ENSG00000278267 -0.4028131 -0.4733165 -0.4725995
## ENSG00000243485 -1.0632245 -1.0362130 -1.0649830
Plotting the rlog transformation
msd_rlog_plot <- meanSdPlot(assay(rld),
ranks =FALSE, # show the data on the original scale
plot = FALSE )
msd_rlog_plot$gg +
ggtitle ("rlog - transformed read counts") +
ylab (" standard deviation ")
Export the rlog counts
#write.table(rldMatrixKOB, "../Control vs KOB/RLogCounts_wtvskob.txt", row.names=TRUE, col.names=TRUE, sep="\t", quote=FALSE)
The variance stabilizing transformation
# Control vs KO
vsd <- vst(dds, blind = TRUE)
head(assay(vsd), 3)
## WT1 WT2 WT3 WT4 KOA1 KOA2 KOA3
## ENSG00000227232 5.750998 5.658701 5.715364 5.639475 5.292552 5.442049 5.427347
## ENSG00000278267 5.078382 4.780859 5.067105 4.780859 5.037709 5.032638 4.780859
## ENSG00000243485 5.078382 5.049253 4.780859 4.780859 5.037709 4.780859 4.780859
## KOA4 KOB1 KOB2 KOB3 KOB4
## ENSG00000227232 5.347172 5.305677 5.439096 5.526275 5.650537
## ENSG00000278267 4.780859 5.113876 5.199317 4.780859 4.780859
## ENSG00000243485 4.780859 5.016598 4.780859 5.046967 4.780859
colData(vsd)
## DataFrame with 12 rows and 6 columns
## condition sample rep genotype cellline sizeFactor
## <factor> <factor> <factor> <factor> <factor> <numeric>
## WT1 Control WT1 1 WT 0 0.852289801791534
## WT2 Control WT2 2 WT 0 1.04801734850232
## WT3 Control WT3 3 WT 0 0.921008729936671
## WT4 Control WT4 4 WT 0 0.997212359449115
## KOA1 KOA KOA1 1 KO 1 1.14461524344672
## ... ... ... ... ... ... ...
## KOA4 KOA KOA4 4 KO 1 0.932275605735372
## KOB1 KOB KOB1 1 KO 2 1.35936301971261
## KOB2 KOB KOB2 2 KO 2 0.858718111825033
## KOB3 KOB KOB3 3 KO 2 1.06615265598051
## KOB4 KOB KOB4 4 KO 2 0.971266300266936
vsd.norm.counts <- assay(vsd)
vsdMatrix <- data.matrix(assay(vsd))
# Control vs KOA
vsd_KOA <- vst(dds_KOA, blind = TRUE)
head(assay(vsd_KOA), 3)
## WT1 WT2 WT3 WT4 KOA1 KOA2 KOA3
## ENSG00000227232 6.332780 6.260976 6.303260 6.246483 5.984154 6.097431 6.085673
## ENSG00000278267 5.824571 5.602130 5.815421 5.602130 5.793560 5.790124 5.602130
## ENSG00000243485 5.824571 5.802349 5.602130 5.602130 5.793560 5.602130 5.602130
## KOA4
## ENSG00000227232 6.025628
## ENSG00000278267 5.602130
## ENSG00000243485 5.602130
colData(vsd_KOA)
## DataFrame with 8 rows and 6 columns
## condition sample rep genotype cellline sizeFactor
## <factor> <factor> <factor> <factor> <factor> <numeric>
## WT1 Control WT1 1 WT 0 0.864271075329799
## WT2 Control WT2 2 WT 0 1.06716159577229
## WT3 Control WT3 3 WT 0 0.940164530728474
## WT4 Control WT4 4 WT 0 1.01504288107225
## KOA1 KOA KOA1 1 KO 1 1.16756529285446
## KOA2 KOA KOA2 2 KO 1 1.21070079387231
## KOA3 KOA KOA3 3 KO 1 0.907771981531634
## KOA4 KOA KOA4 4 KO 1 0.948810489057759
vsd.norm.counts.KOA <- assay(vsd_KOA)
vsdMatrixKOA <- data.matrix(assay(vsd_KOA))
# Control vs KOB
vsd_KOB <- vst(dds_KOB, blind = TRUE)
head(assay(vsd_KOB), 3)
## WT1 WT2 WT3 WT4 KOB1 KOB2 KOB3
## ENSG00000227232 6.086511 6.007654 6.056586 5.990799 5.707764 5.818535 5.895031
## ENSG00000278267 5.515841 5.265148 5.506497 5.265148 5.545739 5.616422 5.265148
## ENSG00000243485 5.515841 5.491269 5.265148 5.265148 5.463712 5.265148 5.489392
## KOB4
## ENSG00000227232 6.000663
## ENSG00000278267 5.265148
## ENSG00000243485 5.265148
colData(vsd_KOB)
## DataFrame with 8 rows and 6 columns
## condition sample rep genotype cellline sizeFactor
## <factor> <factor> <factor> <factor> <factor> <numeric>
## WT1 Control WT1 1 WT 0 0.85902232819576
## WT2 Control WT2 2 WT 0 1.05635227582385
## WT3 Control WT3 3 WT 0 0.926990686134875
## WT4 Control WT4 4 WT 0 1.0064413010846
## KOB1 KOB KOB1 1 KO 1 1.37054295025715
## KOB2 KOB KOB2 2 KO 1 0.872920027845259
## KOB3 KOB KOB3 3 KO 1 1.07414625588402
## KOB4 KOB KOB4 4 KO 1 0.979065713615552
vsd.norm.counts.KOB <- assay(vsd_KOB)
vsdMatrixKOB <- data.matrix(assay(vsd_KOB))
# Plotting the vsd normalisation
msd_vsd_plot <- meanSdPlot(assay(vsd),
ranks =FALSE, # show the data on the original scale
plot = FALSE )
msd_vsd_plot$gg +
ggtitle ("vsd - transformed read counts ") +
ylab (" standard deviation ")
meanSdPlot(assay(vsd))
#write.table(vsdMatrix_KOB, "../Control vs KOB/vsdCounts_wtvskob.txt", row.names=TRUE, col.names=TRUE, sep="\t", quote=FALSE)
Output dispersion plot Need to have estimated the dispersion distance to plot this
options(scipen=999)
dds <- estimateDispersions(dds)
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
dds_KOA <- estimateDispersions(dds_KOA)
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
dds_KOB <- estimateDispersions(dds_KOB)
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
par(mfrow =c(1 , 1)) # to plot the following two images underneath each other
options(scipen=999)
plotDispEsts(dds, genecol="black", fitcol="red", finalcol="dodgerblue", legend=TRUE, log="xy", cex.axis=0.8, cex=0.3, cex.main=0.8, xlab="Mean of normalised counts", ylab="Dispersion")
options(scipen=0)
Histograms to check normalisation methods
hist(norm, breaks=100, xlab="Counts", col="grey", main="Normalised counts")
hist(norm, breaks=10000, xlab="Counts", xlim=c(0,2500), col="grey", main="Normalised counts\n(zoomed range 0:2500)")
hist(log2(norm + 1), breaks=10, xlab="Counts", col="grey", main=bquote(~Log[2]~normalised~counts))
hist(rldMatrix, xlab="Counts", breaks=50, col="grey", main="Regularised log counts")
hist(vsdMatrix, xlab="Counts", col="grey", main="Variance Stablised counts")
Boxplots to check normalisation
# par(mar=c(3,3,3,3), mfrow=c(5,1), cex=1, cex.axis=0.8)
boxplot(norm, main="Normalised counts", xlab="", ylab="Normalised counts", names=paste(sample_info$sample), col=col.sample, las=2)
boxplot(log2(norm+1), main="Log2 + 1 Normalised counts", xlab="", ylab="Log2 +1 Normalised counts", names=paste(sample_info$sample), col=col.sample, las=2)
boxplot(rldMatrix, main="Regularised log counts", xlab="", ylab="Regularised log counts", names=paste(sample_info$sample), col=col.sample, las=2)
boxplot(rldMatrix, main="Regularised log counts\n(outlier genes removed)", xlab="", ylab="Regularised log counts", names=paste(sample_info$sample), col=col.sample, las=2, outline=FALSE)
boxplot(vsdMatrix, main="Variance Stablised counts", xlab="", ylab="Variance Stablised counts", names=paste(sample_info$sample), col=col.sample, las=2)
boxplot(vsdMatrix, main="Variance Stablised counts\n(outlier genes removed)", xlab="", ylab="Variance Stablised counts", names=paste(sample_info$sample), col=col.sample, las=2, outline=FALSE)
Plotting the normalisation methods on scatter
library("dplyr")
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:matrixStats':
##
## count
## The following object is masked from 'package:Biobase':
##
## combine
## The following objects are masked from 'package:GenomicRanges':
##
## intersect, setdiff, union
## The following object is masked from 'package:GenomeInfoDb':
##
## intersect
## The following objects are masked from 'package:IRanges':
##
## collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
##
## first, intersect, rename, setdiff, setequal, union
## The following objects are masked from 'package:BiocGenerics':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("ggplot2")
norm_methods <- bind_rows(
as_tibble(log.norm.counts) %>%
mutate(transformation = "log2(x + 1)"),
as_data_frame(assay(vsd)[, 1:2]) %>% mutate(transformation = "vst"),
as_data_frame(assay(rld)[, 1:2]) %>% mutate(transformation = "rlog"))
## Warning: `as_data_frame()` is deprecated as of tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
colnames(norm_methods)[1:2] <- c("x", "y")
ggplot(norm_methods, aes(x = x, y = y)) + geom_hex(bins = 80) +
coord_fixed() + facet_grid( . ~ transformation)
Calculating distance between samples Using the R log normalisation
Default method for calculating these distances is Euclidean
sampleDists_rld <- dist(t(assay(rld)))
sampleDists_rld
## WT1 WT2 WT3 WT4 KOA1 KOA2 KOA3 KOA4
## WT2 17.86779
## WT3 21.89118 17.47844
## WT4 21.74939 17.45521 16.77447
## KOA1 46.21829 44.57613 43.54028 43.14949
## KOA2 45.21990 44.16379 43.69056 43.33033 15.10727
## KOA3 46.15285 46.68972 47.62597 47.38183 30.82277 29.58662
## KOA4 45.39504 46.12170 47.52792 47.34764 31.98541 30.55019 16.80898
## KOB1 43.41784 42.93969 43.38698 43.43898 37.73391 36.78960 37.64273 37.50792
## KOB2 50.92582 52.32128 54.26172 54.29776 54.62004 53.38077 45.19742 43.71894
## KOB3 46.26449 45.69808 46.00895 45.61349 43.69937 43.26115 36.05568 35.81720
## KOB4 42.76047 43.62720 45.63596 45.56841 42.91343 41.45739 36.01510 35.25560
## KOB1 KOB2 KOB3
## WT2
## WT3
## WT4
## KOA1
## KOA2
## KOA3
## KOA4
## KOB1
## KOB2 37.11127
## KOB3 24.93337 33.25426
## KOB4 21.96729 31.30812 21.22163
## Control vs KOA
sampleDists_rld_KOA <- dist(t(assay(rld_KOA)))
sampleDists_rld_KOA
## WT1 WT2 WT3 WT4 KOA1 KOA2 KOA3
## WT2 14.55641
## WT3 18.62832 14.04608
## WT4 18.42805 13.95896 13.16940
## KOA1 40.38512 38.65553 37.63306 37.21047
## KOA2 39.37564 38.26631 37.81329 37.40644 11.83826
## KOA3 39.92967 40.32927 41.25708 40.93900 26.71598 25.43932
## KOA4 39.29283 39.86617 41.25663 41.00103 27.93115 26.53343 13.33455
## Control vs KOB
sampleDists_rld_KOB <- dist(t(assay(rld_KOB)))
sampleDists_rld_KOB
## WT1 WT2 WT3 WT4 KOB1 KOB2 KOB3
## WT2 15.54566
## WT3 19.61363 15.06737
## WT4 19.41664 14.98661 14.26220
## KOB1 39.55089 39.00689 39.45828 39.44975
## KOB2 46.42924 47.76657 49.67646 49.66145 33.92650
## KOB3 42.21763 41.57151 41.87386 41.43239 22.35779 30.26227
## KOB4 38.83541 39.63255 41.59234 41.47047 19.50730 28.33894 18.88365
#Save the distances between samples to table
sampleDistMatrix_rld <- as.matrix(sampleDists_rld)
sampleDistMatrix_rld_KOA <- as.matrix(sampleDists_rld_KOA)
sampleDistMatrix_rld_KOB <- as.matrix(sampleDists_rld_KOB)
#write.table(as.matrix(sampleDists_rld_KOB), "../Control vs KOB/SampleDistance_rlog_wtvskob.txt", row.names=T, col.names=T, sep="\t", quote=F)
## Using the vsd normalisation
sampleDists_vsd <- dist(t(assay(vsd)))
sampleDists_vsd
## WT1 WT2 WT3 WT4 KOA1 KOA2 KOA3 KOA4
## WT2 35.08239
## WT3 39.89341 34.43582
## WT4 39.66447 34.47077 34.42618
## KOA1 67.54574 64.56907 63.89095 63.18283
## KOA2 65.92732 63.73149 63.84238 63.07745 30.33523
## KOA3 68.25122 67.91308 70.00429 69.48677 48.35801 46.46797
## KOA4 67.14786 67.15581 69.81369 69.29913 49.72929 47.54291 33.64751
## KOB1 63.08197 61.57004 62.85913 62.56769 54.78532 53.11815 55.63546 55.43905
## KOB2 74.62813 75.56887 78.69000 78.28852 78.00529 76.06957 66.99056 64.87341
## KOB3 67.59920 65.82584 67.07623 66.24812 63.09950 62.15453 54.31258 53.97149
## KOB4 63.50732 63.75258 67.07885 66.61596 62.34412 60.17832 54.65819 53.63816
## KOB1 KOB2 KOB3
## WT2
## WT3
## WT4
## KOA1
## KOA2
## KOA3
## KOA4
## KOB1
## KOB2 55.60700
## KOB3 40.19068 51.88815
## KOB4 37.28845 49.80301 37.17433
Heat map to visualise distance between samples uses the normalised count matrix
library("pheatmap")
library("RColorBrewer")
# call the row names and the column name
rownames(sampleDistMatrix_rld) <- paste(rld$sample)
colnames(sampleDistMatrix_rld) <- paste(rld$sample)
# change the cluster method to ward.D2
hc <- hclust(sampleDists_rld, method="ward.D2")
# create colours
colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
mycols <- brewer.pal(3, "Blues")[1:length(unique(sample_info$condition))]
# change the order to make WT be the first sample
callback = function(hc, mat){
sv = svd(t(mat))$v[,1]
dend = reorder(as.dendrogram(hc), wts = sv)
as.hclust(dend)
}
# plot heat map
pheatmap(sampleDistMatrix_rld,
clustering_distance_rows = sampleDists_rld,
clustering_distance_cols = sampleDists_rld,
col = colors,
clustering_callback = callback)
Heat map using gplots
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:IRanges':
##
## space
## The following object is masked from 'package:S4Vectors':
##
## space
## The following object is masked from 'package:stats':
##
## lowess
heatmap.2(as.matrix(sampleDists_rld), key=T, trace="none",
col=colors,
Rowv=as.dendrogram(hc),
cexRow = 0.6, cexCol=0.6,
symm=TRUE,
ColSideColors=mycols[sample_info$condition], RowSideColors=mycols[sample_info$condition],
margin=c(5, 5), main="Sample Distance Matrix", key.title="Sample similarity", key.xlab="Euclidean distance", key.ylab="")
Plot distibution and density with violin plots
violinMatrix <- reshape2::melt(rldMatrix, id.vars=NULL)
colnames(violinMatrix) <- c("Gene","Sample","Expression")
library(ggplot2)
ggplot(violinMatrix, aes(x=Sample, y=Expression)) + geom_violin() + theme(axis.text.x = element_text(angle=45, hjust=1), axis.line= element_line(colour = "black"),panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank())
Dendogram cor () calculates the correlation between columns of a matrix
Calculate Pearson’s correlation distance
distance.rld <- as.dist(1 - cor(assay(rld), method = "pearson" ))
distance.rld.KOA <- as.dist(1 - cor(assay(rld_KOA), method = "pearson" ))
distance.rld.KOB <- as.dist(1 - cor(assay(rld_KOB), method = "pearson" ))
plot () can directly interpret the output of hclust()
par(cex=1.0, cex.axis=0.8, cex.main=0.8)
plot(hclust(distance.rld),
labels = colnames(rld),
main = "rlog transformed Read Counts Distance: Pearson Correlation")
Circular dendrogram and regular dendrogram
distmatrix <- dist(t(rldMatrix), method="euclidean")
hclustObject <- hclust(distmatrix, method="ward.D2")
dend <- as.dendrogram(hclustObject)
plot(dend, ylab="Height", main="rlog transformed Read Counts Distance: Euclidean Distance")
Circular dendrogram
library(circlize)
## ========================================
## circlize version 0.4.8
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: http://jokergoo.github.io/circlize_book/book/
##
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
## in R. Bioinformatics 2014.
## ========================================
library(dendextend)
##
## ---------------------
## Welcome to dendextend version 1.13.4
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
##
## cutree
#Get the heights for each branch
heights <- round(get_branches_heights(dend, sort=FALSE), 1)
#Get max height
maxHeight= max(heights)
#Set label and dendrogram height for cicular dendrogram
labelHeight=0.1
dendHeight=0.8
labels(dend) <- gsub("\\.[0-9]$", "", labels(dend))
#Draw the circular dendrogram
circlize_dendrogram(dend, facing="outside", labels=TRUE, labels_track_height=labelHeight, dend_track_height=dendHeight, cex=0.6)
#Create tick co-ordinates and values for the new axis
#We have to enure that we don't overlap the label plot region (height specified by labelHeight), nor the central region of the plot (1-(dendHeight+labelHeight))
ticks <- seq(from=(1-(dendHeight+labelHeight)), to=(1-labelHeight), length.out=5)
values <- round(rev(seq(from=0, to=maxHeight, length.out=5)), 1)
#Add the new axis
library(plotrix)
##
## Attaching package: 'plotrix'
## The following object is masked from 'package:gplots':
##
## plotCI
ablineclip(h=0, v=ticks, col="black", x1=1-(dendHeight+labelHeight), x2=1-labelHeight, y1=0, y2=0.04, lwd=1.5)
text(ticks, 0+0.08, values, cex=0.8)
text((1-labelHeight)-(((1-labelHeight)-(1-(dendHeight+labelHeight)))/2), 0+0.14, "Height")
Scatterplots Perform pairwise scatter plots on the samples
#library(car)
#scatterplotMatrix(rldMatrix[,c("WT1","WT2","WT3","WT4")], diagonal="boxplot", pch=".")
#scatterplotMatrix(rldMatrix[,c("KOA1","KOA2","KOA3","KOA4")], diagonal="boxplot", pch=".")
#scatterplotMatrix(rldMatrix[,c("WT1","WT2","WT3","WT4","WT4","KOA1","KOA2","KOA3","KOA4","KOB1","KOB2", "KOB3","KOB4")], diagonal="boxplot", pch=".")
PCA analysis using R function
project.pca <- prcomp(t(assay(rld)))
plot(project.pca$x[,1], project.pca$x[,2],
col = col.sample,
xlab="PC1",
ylab="PC2",
main = "PCA of seq.depth normalized \n and rlog - transformed read counts")
rownames(rldMatrix) <- rownames(dds)
# Control vs KOA
project.pca.KOA <- prcomp(t(assay(rld_KOA)))
# Control vs KOB
project.pca.KOB <- prcomp(t(assay(rld_KOB)))
Accessing the PCA results
library(factoextra)
## Warning: package 'factoextra' was built under R version 3.6.2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Eigenvalues
eig.val <- get_eigenvalue(project.pca)
eig.val
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 3.510358e+02 4.308818e+01 43.08818
## Dim.2 2.279075e+02 2.797469e+01 71.06287
## Dim.3 8.037408e+01 9.865583e+00 80.92846
## Dim.4 4.871433e+01 5.979481e+00 86.90794
## Dim.5 3.012211e+01 3.697363e+00 90.60530
## Dim.6 1.764331e+01 2.165643e+00 92.77094
## Dim.7 1.326953e+01 1.628779e+00 94.39972
## Dim.8 1.264067e+01 1.551590e+00 95.95131
## Dim.9 1.229353e+01 1.508980e+00 97.46029
## Dim.10 1.090043e+01 1.337983e+00 98.79827
## Dim.11 9.790360e+00 1.201726e+00 100.00000
## Dim.12 3.552459e-27 4.360495e-28 100.00000
#results for variable
res.var <- get_pca_var(project.pca)
head(res.var$contrib, n=3) # Contributions to the PCs
## Dim.1 Dim.2 Dim.3 Dim.4
## ENSG00000227232 1.937062e-03 9.078206e-04 2.992322e-04 9.863817e-04
## ENSG00000278267 2.703278e-06 3.271651e-05 3.301511e-05 1.275012e-03
## ENSG00000243485 3.610077e-06 2.041188e-06 3.627546e-05 1.298099e-05
## Dim.5 Dim.6 Dim.7 Dim.8
## ENSG00000227232 4.372805e-04 1.001396e-02 1.940705e-03 3.166447e-03
## ENSG00000278267 6.772119e-05 7.279686e-04 9.054295e-05 7.636901e-05
## ENSG00000243485 2.442738e-05 7.559114e-06 3.948326e-04 6.692067e-05
## Dim.9 Dim.10 Dim.11 Dim.12
## ENSG00000227232 0.0005852543 8.149232e-04 2.947250e-03 0.0005619406
## ENSG00000278267 0.0014979652 4.950499e-05 1.020601e-05 1.2145557923
## ENSG00000243485 0.0002051001 1.058579e-03 7.284814e-04 0.2149089544
head(res.var$coord, n=3)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## ENSG00000227232 -0.082460784 0.045486165 0.015508229 -0.021920521 -0.011476850
## ENSG00000278267 0.003080499 0.008635009 -0.005151271 0.024922152 -0.004516531
## ENSG00000243485 -0.003559868 0.002156854 -0.005399636 -0.002514677 -0.002712571
## Dim.6 Dim.7 Dim.8 Dim.9 Dim.10
## ENSG00000227232 0.04203325 0.016047506 0.020006505 -0.008482242 0.009424975
## ENSG00000278267 -0.01133304 -0.003466211 0.003107018 -0.013570292 0.002322985
## ENSG00000243485 0.00115485 -0.007238261 -0.002908475 -0.005021360 -0.010741962
## Dim.11 Dim.12
## ENSG00000227232 0.0169866526 1.412895e-16
## ENSG00000278267 -0.0009996025 -6.568607e-15
## ENSG00000243485 -0.0084451731 2.763069e-15
# results for samples
res.ind <- get_pca_ind(project.pca)
head(res.ind$coord, n=3) # Coordinates
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6 Dim.7
## WT1 -23.22899 5.543174 6.118345 0.06918094 -9.254330 2.8398988 -5.772496
## WT2 -24.80668 4.003650 2.259764 -0.46153624 -1.810968 0.1437868 2.995273
## WT3 -25.70757 1.996612 -1.194368 -0.39323997 5.229398 -3.1874656 6.589442
## Dim.8 Dim.9 Dim.10 Dim.11 Dim.12
## WT1 0.7554927 -4.913642 0.6087923 -0.943857537 5.749226e-14
## WT2 -2.3520797 3.800862 -7.9465461 -0.001192834 5.403147e-14
## WT3 2.1636374 -5.496651 2.4464752 0.535316495 5.149342e-14
head(res.ind$contrib, n=3) # Contributions to the PCs
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6 Dim.7
## WT1 12.80937 1.1235107 3.8812410 0.000818719 23.6931827 3.809293644 20.926207
## WT2 14.60847 0.5861009 0.5294550 0.036439603 0.9073088 0.009765099 5.634249
## WT3 15.68878 0.1457630 0.1479038 0.026453146 7.5654859 4.798767624 27.268461
## Dim.8 Dim.9 Dim.10 Dim.11 Dim.12
## WT1 0.3762782 16.366243 0.2833436 0.7582859389 7.753690
## WT2 3.6471411 9.792784 48.2760616 0.0000012111 6.848307
## WT3 3.0861530 20.480400 4.5756906 0.2439166000 6.220040
head(res.ind$cos2, n=3) # Quality of representation
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6
## WT1 0.7092748 0.040389746 0.049206437 6.291105e-06 0.11257553 1.060132e-02
## WT2 0.8405352 0.021894285 0.006975010 2.909579e-04 0.00447961 2.823946e-05
## WT3 0.8381044 0.005055492 0.001809059 1.961063e-04 0.03468004 1.288449e-02
## Dim.7 Dim.8 Dim.9 Dim.10 Dim.11 Dim.12
## WT1 0.04380072 0.0007502647 0.03173664 0.0004871833 1.171027e-03 4.344830e-30
## WT2 0.01225438 0.0075565328 0.01973252 0.0862532223 1.943473e-09 3.987604e-30
## WT3 0.05506473 0.0059367007 0.03831533 0.0075902808 3.634105e-04 3.362635e-30
# results of PCs for samples
project.pca$x
## PC1 PC2 PC3 PC4 PC5 PC6
## WT1 -23.228986 5.543174 6.118345 0.06918094 -9.2543301 2.8398988
## WT2 -24.806684 4.003650 2.259764 -0.46153624 -1.8109683 0.1437868
## WT3 -25.707565 1.996612 -1.194368 -0.39323997 5.2293981 -3.1874656
## WT4 -25.535127 1.662806 -1.270357 -0.68606846 5.9286554 -0.1461733
## KOA1 4.928790 -23.998438 -8.091365 6.69717509 2.2642584 2.5213560
## KOA2 5.693606 -23.049528 -6.743927 6.80816996 -1.5377202 2.7995414
## KOA3 14.632433 -13.478320 12.624572 -6.04404468 0.6607942 -2.8169145
## KOA4 14.587299 -11.790398 14.037865 -5.13124135 -1.1018168 -2.3107256
## KOB1 11.416497 7.684555 -15.373007 -1.78008264 -5.0591125 -9.6133057
## KOB2 18.648333 24.668419 8.624546 15.93293731 2.7639463 -0.6137804
## KOB3 15.170878 13.263970 -5.587675 -9.87786553 9.1959234 4.2873393
## KOB4 14.200527 13.493500 -5.404394 -5.13338443 -7.2790279 6.0964427
## PC7 PC8 PC9 PC10 PC11 PC12
## WT1 -5.77249572 0.7554927 -4.91364155 0.60879233 -0.943857537 5.749226e-14
## WT2 2.99527287 -2.3520797 3.80086157 -7.94654612 -0.001192834 5.403147e-14
## WT3 6.58944248 2.1636374 -5.49665139 2.44647520 0.535316495 5.149342e-14
## WT4 -3.98698663 -0.4448173 6.61116625 4.92486428 0.462968309 2.709912e-14
## KOA1 -0.07826683 -0.2109891 -0.53325524 -0.55164210 -7.111941356 8.137417e-14
## KOA2 -0.26297294 -0.1287104 -0.56406390 -0.32941627 7.389527724 8.956381e-14
## KOA3 -0.88352517 7.9117579 1.72911827 -1.56595304 -0.049524194 8.497752e-14
## KOA4 1.45023303 -7.7951403 -0.62842084 2.40456549 -0.194873543 6.352729e-14
## KOB1 -2.04398058 -0.5919728 0.08733903 -0.36879172 -0.201817901 5.608896e-14
## KOB2 -0.10550063 0.2921487 0.17393822 -0.09855848 0.022540079 2.537775e-14
## KOB3 -3.13415665 -1.2363129 -3.18746144 -2.69795761 0.764541906 5.364961e-15
## KOB4 5.23293679 1.6369858 2.92107103 3.17416802 -0.671687148 -5.935425e-14
sample_info$projectpca2 <- project.pca$x[,"PC1"]
sample_info$projectpca3 <- project.pca$x[,"PC2"]
Determine the proportion of variance of each component Proportion of variance equals (PC stdev^2) / (sum all PCs stdev^2)
project.pca.proportionvariances <- ((project.pca$sdev^2) / (sum(project.pca$sdev^2)))*100
# Control vs KOA
project.pca.proportionvariances.KOA <- ((project.pca.KOA$sdev^2) / (sum(project.pca.KOA$sdev^2)))*100
#Control vs KOB
project.pca.proportionvariances.KOB <- ((project.pca.KOB$sdev^2) / (sum(project.pca.KOB$sdev^2)))*100
Scree plot of PCA
barplot(project.pca.proportionvariances, cex.names=1, xlab=paste("Principal component (PC), 1-", length(project.pca$sdev)), ylab="Proportion of variation (%)", main="Scree plot", ylim=c(0,100))
Scatter Plot of PCA 1-4
par(xpd=TRUE)
pairs(project.pca$x[,1:4], col=col.sample, main="Principal components analysis bi-plot\nPCs 1-4", pch=16, oma=c(2,2,2,13))
legend("bottomright", cex=0.75, fill = unique(col.sample), legend = c( levels(sample_info$sample)))
Scatter Plot of PCA 5-8
par(xpd=TRUE)
pairs(project.pca$x[,5:8], col=col.sample, main="Principal components analysis bi-plot\nPCs 6-10", pch=16, oma=c(2,2,2,13))
legend("bottomright", cex=0.75, fill = unique(col.sample), legend = c( levels(sample_info$sample)))
Scatter Plots
par(mar=c(4,4,4,4), mfrow=c(2,3), cex=1.0, cex.main=0.5, cex.axis=0.8)
#Plots scatter plot for PC 1 and 2
plot(project.pca$x, type="n", main="Principal components analysis bi-plot", xlab=paste("PC1, ", round(project.pca.proportionvariances[1], 2), "%"), ylab=paste("PC2, ", round(project.pca.proportionvariances[2], 2), "%"))
points(project.pca$x, col=col.sample, pch= c(15, 16, 17, 18), cex=1)
#Plots scatter plot for PC 1 and 3
plot(project.pca$x[,1], project.pca$x[,3], type="n", main="Principal components analysis bi-plot", xlab=paste("PC1, ", round(project.pca.proportionvariances[1], 2), "%"), ylab=paste("PC3, ", round(project.pca.proportionvariances[3], 2), "%"))
points(project.pca$x[,1], project.pca$x[,3], col=col.sample, pch= c(15, 16, 17, 18), cex=1)
#Plots scatter plot for PC 2 and 3
plot(project.pca$x[,2], project.pca$x[,3], type="n",main="Principal components analysis bi-plot", xlab=paste("PC2, ", round(project.pca.proportionvariances[2], 2), "%"), ylab=paste("PC3, ", round(project.pca.proportionvariances[3], 2), "%"))
points(project.pca$x[,2], project.pca$x[,3], col=col.sample, pch= c(15, 16, 17, 18), cex=1)
#Plots scatter plot for PC 2 and 4
plot(project.pca$x[,2], project.pca$x[,4], type="n", main="Principal components analysis bi-plot", xlab=paste("PC2, ", round(project.pca.proportionvariances[2], 2), "%"), ylab=paste("PC4, ", round(project.pca.proportionvariances[4], 2), "%"))
points(project.pca$x[,2], project.pca$x[,4], col=col.sample, pch= c(15, 16, 17, 18), cex=1)
#Plots scatter plot for PC 3 and 4
plot(project.pca$x[,3], project.pca$x[,4], type="n",main="Principal components analysis bi-plot", xlab=paste("PC3, ", round(project.pca.proportionvariances[3], 2), "%"), ylab=paste("PC4, ", round(project.pca.proportionvariances[4], 2), "%"))
points(project.pca$x[,3], project.pca$x[,4], col=col.sample, pch= c(15, 16, 17, 18), cex=1)
par(xpd=TRUE)
plot.new()
legend("bottomright", bty="n", cex=0.8, title="Condition", legend=c("Control 1","Control 2","Control 3","Control 4","KOA1","KOA2","KOA3","KOA4","KOB1","KOB2","KOB3", "KOB4"), col=c(col.sample), pch=c(15, 16, 17, 18))
3D scatter plot of PCAs
library(scatterplot3d)
par(mar=c(4,4,4,4), mfrow=c(2,2), cex=1.0, cex.main=0.8, cex.axis=0.8)
scatterplot3d(project.pca$x[,1:3], angle=-40, main="", color=col.genotype, pch=17, xlab=paste("PC1, ", round(project.pca.proportionvariances[1], 2), "%"), ylab=paste("PC2, ", round(project.pca.proportionvariances[2], 2), "%"), zlab=paste("PC3, ", round(project.pca.proportionvariances[3], 2), "%"), grid=FALSE, box=FALSE)
source('http://www.sthda.com/sthda/RDoc/functions/addgrids3d.r')
addgrids3d(project.pca$x[,2:4], grid = c("xy", "xz", "yz"))
par(xpd=TRUE)
plot.new()
legend("left", bty="n", cex=0.8, title="Condition", c("Control 1","Control 2","Control 3","Control 4","KOA1","KOA2","KOA3","KOA4","KOB1","KOB2","KOB3", "KOB4"), fill=c(col.genotype))
scatterplot3d(project.pca$x[,1:3], angle=40, main="", color=col.condition, pch=17, xlab=paste("PC1, ", round(project.pca.proportionvariances[1], 2), "%"), ylab=paste("PC2, ", round(project.pca.proportionvariances[2], 2), "%"), zlab=paste("PC3, ", round(project.pca.proportionvariances[3], 2), "%"), grid=FALSE, box=FALSE)
source('http://www.sthda.com/sthda/RDoc/functions/addgrids3d.r')
addgrids3d(project.pca$x[,2:4], grid = c("xy", "xz", "yz"))
par(xpd=TRUE)
plot.new()
legend("left", bty="n", cex=0.8, title="Condition", c("Control 1","Control 2","Control 3","Control 4","KOA1","KOA2","KOA3","KOA4","KOB1","KOB2","KOB3", "KOB4"), fill=c(col.condition))
Write out eigenvector 1 eigenvalues as PC1 seperates the control from KO the best
library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:matrixStats':
##
## count
## The following object is masked from 'package:IRanges':
##
## desc
## The following object is masked from 'package:S4Vectors':
##
## rename
project.pca <- prcomp(t(rldMatrix))
summary(project.pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 18.7359 15.0966 8.96516 6.97957 5.48836 4.20039 3.64274
## Proportion of Variance 0.4309 0.2797 0.09866 0.05979 0.03697 0.02166 0.01629
## Cumulative Proportion 0.4309 0.7106 0.80928 0.86908 0.90605 0.92771 0.94400
## PC8 PC9 PC10 PC11 PC12
## Standard deviation 3.55537 3.50621 3.30158 3.12896 5.96e-14
## Proportion of Variance 0.01552 0.01509 0.01338 0.01202 0.00e+00
## Cumulative Proportion 0.95951 0.97460 0.98798 1.00000 1.00e+00
project.pca.proportionvariances <- ((project.pca$sdev^2) / (sum(project.pca$sdev^2)))*100
# get the eigenvalues for PC1 for each gene (Control vs KO)
PC1_values <- data.frame(abs(project.pca$rotation[,c("PC1")]))
PC1_values<- data.frame(rownames(PC1_values), PC1_values)
head(PC1_values, n=3)
## rownames.PC1_values. abs.project.pca.rotation...c..PC1....
## ENSG00000227232 ENSG00000227232 0.0044012067
## ENSG00000278267 ENSG00000278267 0.0001644165
## ENSG00000243485 ENSG00000243485 0.0001900020
# Change the column names
names(PC1_values) <- c("Geneid","PC1")
PC1_values <- join(PC1_values, gtf, by="Geneid")
row.names(PC1_values) <- PC1_values$Geneid
head(PC1_values, n=3)
## Geneid PC1 gene_id GeneSymbol
## ENSG00000227232 ENSG00000227232 0.0044012067 ENSG00000227232 WASH7P
## ENSG00000278267 ENSG00000278267 0.0001644165 ENSG00000278267 MIR6859-1
## ENSG00000243485 ENSG00000243485 0.0001900020 ENSG00000243485 MIR1302-2HG
## Chromosome Class Strand
## ENSG00000227232 1:14404-29570 unprocessed_pseudogene -
## ENSG00000278267 1:17369-17436 miRNA -
## ENSG00000243485 1:29554-31109 lncRNA +
# Control vs KOA
PC1_values_KOA <- data.frame(abs(project.pca.KOA$rotation[,c("PC1")]))
PC1_values_KOA <- data.frame(rownames(PC1_values_KOA), PC1_values_KOA)
head(PC1_values, n=3)
## Geneid PC1 gene_id GeneSymbol
## ENSG00000227232 ENSG00000227232 0.0044012067 ENSG00000227232 WASH7P
## ENSG00000278267 ENSG00000278267 0.0001644165 ENSG00000278267 MIR6859-1
## ENSG00000243485 ENSG00000243485 0.0001900020 ENSG00000243485 MIR1302-2HG
## Chromosome Class Strand
## ENSG00000227232 1:14404-29570 unprocessed_pseudogene -
## ENSG00000278267 1:17369-17436 miRNA -
## ENSG00000243485 1:29554-31109 lncRNA +
# Change the column names
names(PC1_values_KOA) <- c("Geneid","PC1")
PC1_values_KOA <- join(PC1_values_KOA, gtf, by="Geneid")
row.names(PC1_values_KOA) <- PC1_values_KOA$Geneid
head(PC1_values, n=3)
## Geneid PC1 gene_id GeneSymbol
## ENSG00000227232 ENSG00000227232 0.0044012067 ENSG00000227232 WASH7P
## ENSG00000278267 ENSG00000278267 0.0001644165 ENSG00000278267 MIR6859-1
## ENSG00000243485 ENSG00000243485 0.0001900020 ENSG00000243485 MIR1302-2HG
## Chromosome Class Strand
## ENSG00000227232 1:14404-29570 unprocessed_pseudogene -
## ENSG00000278267 1:17369-17436 miRNA -
## ENSG00000243485 1:29554-31109 lncRNA +
#Control vs KOB
PC1_values_KOB <- data.frame(abs(project.pca.KOB$rotation[,c("PC1")]))
PC1_values_KOB <- data.frame(rownames(PC1_values_KOB), PC1_values_KOB)
head(PC1_values, n=3)
## Geneid PC1 gene_id GeneSymbol
## ENSG00000227232 ENSG00000227232 0.0044012067 ENSG00000227232 WASH7P
## ENSG00000278267 ENSG00000278267 0.0001644165 ENSG00000278267 MIR6859-1
## ENSG00000243485 ENSG00000243485 0.0001900020 ENSG00000243485 MIR1302-2HG
## Chromosome Class Strand
## ENSG00000227232 1:14404-29570 unprocessed_pseudogene -
## ENSG00000278267 1:17369-17436 miRNA -
## ENSG00000243485 1:29554-31109 lncRNA +
# Change the column names
names(PC1_values_KOB) <- c("Geneid","PC1")
PC1_values_KOB <- join(PC1_values_KOB, gtf, by="Geneid")
row.names(PC1_values_KOB) <- PC1_values_KOB$Geneid
head(PC1_values, n=3)
## Geneid PC1 gene_id GeneSymbol
## ENSG00000227232 ENSG00000227232 0.0044012067 ENSG00000227232 WASH7P
## ENSG00000278267 ENSG00000278267 0.0001644165 ENSG00000278267 MIR6859-1
## ENSG00000243485 ENSG00000243485 0.0001900020 ENSG00000243485 MIR1302-2HG
## Chromosome Class Strand
## ENSG00000227232 1:14404-29570 unprocessed_pseudogene -
## ENSG00000278267 1:17369-17436 miRNA -
## ENSG00000243485 1:29554-31109 lncRNA +
# write.table(PC1_values_KOB, "../Control vs KOB/PC1.Eigenvalues.wtvskob.csv", col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")
# If pulling our two eigenvalues
# wObject <- data.frame(abs(project.pca.KOB$rotation[,c("PC1","PC2")]))
# to calculate the mean if pulling out two eigenvalues
#PC1_PC2_values <- data.frame(abs(project.pca$rotation[,c("PC1","PC2")]))
#PC1_PC2_values <- data.frame(rownames(PC1_PC2_values), PC1_PC2_values, apply(PC1_PC2_values, 1, mean))
#PC1_PC2_values$mean <- (PC1_PC2_values$PC1+PC1_PC2_values$PC2)/2
#names(PC1_PC2_values) <- c("GeneID","PC1", "PC2", "Mean", "mean")
#PC1_PC2_values
#write.table(PC1_PC2_values, "PC1.PC2.Eigenvalues.csv", col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")
Order the results
# Default of order is ascending
# We want to order according to the highest eigenvalue so we want descending
sorted_PC1_values <- PC1_values[order(-PC1_values$PC1), ]
head(sorted_PC1_values)
## Geneid PC1 gene_id GeneSymbol
## ENSG00000198681 ENSG00000198681 0.11484541 ENSG00000198681 MAGEA1
## ENSG00000221867 ENSG00000221867 0.10906390 ENSG00000221867 MAGEA3
## ENSG00000197172 ENSG00000197172 0.10716853 ENSG00000197172 MAGEA6
## ENSG00000136167 ENSG00000136167 0.10545251 ENSG00000136167 LCP1
## ENSG00000170627 ENSG00000170627 0.10459982 ENSG00000170627 GTSF1
## ENSG00000130844 ENSG00000130844 0.07799242 ENSG00000130844 ZNF331
## Chromosome Class Strand
## ENSG00000198681 X:153179284-153183880 protein_coding +
## ENSG00000221867 X:152698767-152702347 protein_coding +
## ENSG00000197172 X:152766136-152769747 protein_coding -
## ENSG00000136167 13:46125920-46211871 protein_coding -
## ENSG00000170627 12:54455950-54473602 protein_coding -
## ENSG00000130844 19:53519527-53580269 protein_coding +
sorted_PC1_values_KOA <- PC1_values_KOA[order(-PC1_values_KOA$PC1), ]
sorted_PC1_values_KOB <- PC1_values_KOB[order(-PC1_values_KOB$PC1), ]
Take top 500 genes # wObject <- data.frame(sort(abs(project.pca$rotation[,c(“PC1”,“PC2”)]), decreasing=TRUE)[1:500])
top500_PCA <- data.frame(sort(abs(project.pca$rotation[,c("PC1")]), decreasing=TRUE)[1:500])
top500_PCA_KOA <- PC1_values_KOA[order(-PC1_values_KOA$PC1),][1:500,]
top500_PCA_KOB <- PC1_values_KOB[order(-PC1_values_KOB$PC1),][1:500,]
Take Top 20 genes sorted by PC value
top20_PCA <- PC1_values[order(-PC1_values$PC1),][1:20,]
DeSeq2 uses a negative binomial model to fit the observed read counts to arrive at the estimate for the difference. We need to estimate two parameters from the read counts: the mean as well as the dispersion. The null hypothesis is that there is no systematic difference between the average read count values of the different conditions for a given gene. The p-values are calculated and both tests are some variation of the well-known t-test (How dissimilar are the means of two populations?) or ANOVAs (How well does a reduced model capture the data when compared to the full model with all coefficients?). Once you’ve obtained a list of p-values for all the genes of your data set, it is important to realize that you just performed the same type of test for thousands and thousands of genes. That means, that even if you decide to focus on genes with a p-value smaller than 0.05, if you’ve looked at 10,000 genes your nal list may contain 0:05/10; 000 = 500 false positive hits. To guard yourself against this, all the tools will over some sort of correction for the multiple hypotheses you tested, e.g. in the form of the Benjamini-Hochberg formula. You should defnitely rely on the adjusted p-values rather than the original ones to zoom into possible candidates for downstream analyses and follow-up studies.
Running DEG analysis Here we want to look at the effect of the KO’s versus the wildtype samples, with the wildtype values used as the denominator for the fold change calculation.
DESeq2 uses the levels of the condition to determine the order of the comparison so it important to set WT as the first-level factor
str(colData(dds)$genotype)
## Factor w/ 2 levels "WT","KO": 1 1 1 1 2 2 2 2 2 2 ...
colData(dds)$condition <- relevel(colData(dds)$genotype, "WT")
Run DeSeq2 The log2 fold change and Wald test p value will be for the last variable in the design formula (in our case just genotype), and if this is a factor, the comparison will be the last level of this variable (KO) over the reference level (control) The order of the variables of the design do not matter so long as the user specifies the comparison to build a results table for, using the name or contrast arguments of results.
The Wald method is the default
dds <- DESeq(dds, test="Wald")
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 18 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
dds_KOA <- DESeq(dds_KOA, test="Wald")
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
dds_KOB <- DESeq(dds_KOB, test="Wald")
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
Log fold change shrinkage for visualisation and ranking Shrinkage of effect size (LFC estimates) is useful for visualization and ranking of genes.
resultsNames(dds)
## [1] "Intercept" "genotype_KO_vs_WT"
resLFC <- lfcShrink(dds, coef="genotype_KO_vs_WT", type="apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
head(resLFC, n=3)
## log2 fold change (MAP): genotype KO vs WT
## Wald test p-value: genotype KO vs WT
## DataFrame with 3 rows and 5 columns
## baseMean log2FoldChange lfcSE
## <numeric> <numeric> <numeric>
## ENSG00000227232 7.6623639348898 -0.656891061460584 0.434282096622347
## ENSG00000278267 0.647706347116327 0.0177831889138679 0.351712904713223
## ENSG00000243485 0.389561598236757 -0.0229189620614307 0.354630906453473
## pvalue padj
## <numeric> <numeric>
## ENSG00000227232 0.0170449666854816 0.0640026412501712
## ENSG00000278267 0.809212361995228 NA
## ENSG00000243485 0.791280314384341 NA
By default the argument alpha is set to 0.1. If the adjusted p value cutoff will be a value other than 0.1, alpha should be set to that value: results function automatically performs independent filtering based on the mean of normalized counts for each gene, optimizing the number of genes which will have an adjusted p value below a given FDR cutoff, alpha.
Run the differential tests on the counts matrix and use FDR correction NB - use ‘independentFiltering=FALSE’ and ‘cooksCutoff=FALSE’ to switch off conversion of values to ‘NA’ if failing FDR conversion or Cook’s Distance, respectively
unique(sample_info$genotype)
## [1] WT KO
## Levels: WT KO
library(DESeq2)
options(scipen=999)
## Simple Extraction of the results
res <- results(dds)
summary(res)
##
## out of 32385 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 3535, 11%
## LFC < 0 (down) : 3086, 9.5%
## outliers [1] : 0, 0%
## low counts [2] : 10674, 33%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
## Modifying extraction for alpha = 0.05
ControlvsKO <- results(dds, pAdjustMethod="BH", independentFiltering = TRUE, parallel = FALSE, alpha=0.05)
Results_convsko <- as.data.frame(ControlvsKO)
Results_convsko <- data.frame(rownames(Results_convsko), Results_convsko, row.names=rownames(Results_convsko))
names(Results_convsko)[1] <- "Geneid"
options(scipen=999)
head(Results_convsko)
## Geneid baseMean log2FoldChange lfcSE stat
## ENSG00000227232 ENSG00000227232 7.6623639 -0.9743474 0.4084053 -2.3857363
## ENSG00000278267 ENSG00000278267 0.6477063 0.3820494 1.5823632 0.2414423
## ENSG00000243485 ENSG00000243485 0.3895616 -0.4809456 1.8173001 -0.2646484
## ENSG00000238009 ENSG00000238009 0.2783860 -1.2239389 2.4179691 -0.5061847
## ENSG00000241860 ENSG00000241860 3.1078482 -0.9205363 0.6737639 -1.3662593
## ENSG00000279928 ENSG00000279928 0.5291645 -0.6922107 1.6836259 -0.4111428
## pvalue padj
## ENSG00000227232 0.01704497 0.06400264
## ENSG00000278267 0.80921236 NA
## ENSG00000243485 0.79128031 NA
## ENSG00000238009 0.61272702 NA
## ENSG00000241860 0.17185758 0.34792987
## ENSG00000279928 0.68096783 NA
# Merge with the file which contains geneid and gene names
Results_convsko <- join(Results_convsko, gtf,by="Geneid")
row.names(Results_convsko) <- Results_convsko$Geneid
head(Results_convsko)
## Geneid baseMean log2FoldChange lfcSE stat
## ENSG00000227232 ENSG00000227232 7.6623639 -0.9743474 0.4084053 -2.3857363
## ENSG00000278267 ENSG00000278267 0.6477063 0.3820494 1.5823632 0.2414423
## ENSG00000243485 ENSG00000243485 0.3895616 -0.4809456 1.8173001 -0.2646484
## ENSG00000238009 ENSG00000238009 0.2783860 -1.2239389 2.4179691 -0.5061847
## ENSG00000241860 ENSG00000241860 3.1078482 -0.9205363 0.6737639 -1.3662593
## ENSG00000279928 ENSG00000279928 0.5291645 -0.6922107 1.6836259 -0.4111428
## pvalue padj gene_id GeneSymbol
## ENSG00000227232 0.01704497 0.06400264 ENSG00000227232 WASH7P
## ENSG00000278267 0.80921236 NA ENSG00000278267 MIR6859-1
## ENSG00000243485 0.79128031 NA ENSG00000243485 MIR1302-2HG
## ENSG00000238009 0.61272702 NA ENSG00000238009 AL627309.1
## ENSG00000241860 0.17185758 0.34792987 ENSG00000241860 AL627309.5
## ENSG00000279928 0.68096783 NA ENSG00000279928 FO538757.1
## Chromosome Class Strand
## ENSG00000227232 1:14404-29570 unprocessed_pseudogene -
## ENSG00000278267 1:17369-17436 miRNA -
## ENSG00000243485 1:29554-31109 lncRNA +
## ENSG00000238009 1:89295-133723 lncRNA -
## ENSG00000241860 1:141474-173862 lncRNA -
## ENSG00000279928 1:182696-184174 unprocessed_pseudogene +
# Control vs KOA
ControlvsKOA <- results(dds_KOA, pAdjustMethod="BH", independentFiltering = TRUE, parallel = FALSE, alpha=0.05)
Results_KOA <- as.data.frame(ControlvsKOA)
Results_KOA <- data.frame(rownames(Results_KOA), Results_KOA, row.names=rownames(Results_KOA))
names(Results_KOA)[1] <- "Geneid"
options(scipen=999)
head(Results_KOA)
## Geneid baseMean log2FoldChange lfcSE stat
## ENSG00000227232 ENSG00000227232 7.9398237 -1.2331087 0.5160606 -2.38946482
## ENSG00000278267 ENSG00000278267 0.4878924 -0.1917910 2.0821960 -0.09210994
## ENSG00000243485 ENSG00000243485 0.3688241 -0.8274624 2.4135298 -0.34284326
## ENSG00000238009 ENSG00000238009 0.2677781 -1.5744640 3.0121374 -0.52270656
## ENSG00000241860 ENSG00000241860 3.4177048 -0.9133677 0.8521401 -1.07185165
## ENSG00000279928 ENSG00000279928 0.3574138 -2.0529070 2.7279481 -0.75254621
## pvalue padj
## ENSG00000227232 0.01687294 0.0548153
## ENSG00000278267 0.92661069 NA
## ENSG00000243485 0.73171638 NA
## ENSG00000238009 0.60117847 NA
## ENSG00000241860 0.28378667 0.4677380
## ENSG00000279928 0.45172265 NA
Results_KOA <- join(Results_KOA, gtf,by="Geneid")
row.names(Results_KOA) <- Results_KOA$Geneid
head(Results_KOA)
## Geneid baseMean log2FoldChange lfcSE stat
## ENSG00000227232 ENSG00000227232 7.9398237 -1.2331087 0.5160606 -2.38946482
## ENSG00000278267 ENSG00000278267 0.4878924 -0.1917910 2.0821960 -0.09210994
## ENSG00000243485 ENSG00000243485 0.3688241 -0.8274624 2.4135298 -0.34284326
## ENSG00000238009 ENSG00000238009 0.2677781 -1.5744640 3.0121374 -0.52270656
## ENSG00000241860 ENSG00000241860 3.4177048 -0.9133677 0.8521401 -1.07185165
## ENSG00000279928 ENSG00000279928 0.3574138 -2.0529070 2.7279481 -0.75254621
## pvalue padj gene_id GeneSymbol
## ENSG00000227232 0.01687294 0.0548153 ENSG00000227232 WASH7P
## ENSG00000278267 0.92661069 NA ENSG00000278267 MIR6859-1
## ENSG00000243485 0.73171638 NA ENSG00000243485 MIR1302-2HG
## ENSG00000238009 0.60117847 NA ENSG00000238009 AL627309.1
## ENSG00000241860 0.28378667 0.4677380 ENSG00000241860 AL627309.5
## ENSG00000279928 0.45172265 NA ENSG00000279928 FO538757.1
## Chromosome Class Strand
## ENSG00000227232 1:14404-29570 unprocessed_pseudogene -
## ENSG00000278267 1:17369-17436 miRNA -
## ENSG00000243485 1:29554-31109 lncRNA +
## ENSG00000238009 1:89295-133723 lncRNA -
## ENSG00000241860 1:141474-173862 lncRNA -
## ENSG00000279928 1:182696-184174 unprocessed_pseudogene +
# Control vs KOB
ControlvsKOB <- results(dds_KOB, pAdjustMethod="BH", independentFiltering = TRUE, parallel = FALSE, alpha=0.05)
Results_KOB <- as.data.frame(ControlvsKOB)
Results_KOB <- data.frame(rownames(Results_KOB), Results_KOB, row.names=rownames(Results_KOB))
names(Results_KOB)[1] <- "Geneid"
options(scipen=999)
head(Results_KOB)
## Geneid baseMean log2FoldChange lfcSE stat
## ENSG00000227232 ENSG00000227232 9.0073108 -0.7552737 0.5093023 -1.48295763
## ENSG00000278267 ENSG00000278267 0.7491637 0.8017708 1.9002480 0.42192958
## ENSG00000243485 ENSG00000243485 0.4714222 -0.1774138 2.1746931 -0.08158106
## ENSG00000238009 ENSG00000238009 0.4129118 -0.8760893 2.4293887 -0.36062131
## ENSG00000241860 ENSG00000241860 3.4335950 -0.9342857 0.8377007 -1.11529783
## ENSG00000279928 ENSG00000279928 0.7873172 0.2396154 1.6939425 0.14145428
## pvalue padj
## ENSG00000227232 0.1380857 0.2730521
## ENSG00000278267 0.6730764 NA
## ENSG00000243485 0.9349799 NA
## ENSG00000238009 0.7183826 NA
## ENSG00000241860 0.2647228 0.4374984
## ENSG00000279928 0.8875111 NA
Results_KOB <- join(Results_KOB, gtf,by="Geneid")
row.names(Results_KOB) <- Results_KOB$Geneid
head(Results_KOB)
## Geneid baseMean log2FoldChange lfcSE stat
## ENSG00000227232 ENSG00000227232 9.0073108 -0.7552737 0.5093023 -1.48295763
## ENSG00000278267 ENSG00000278267 0.7491637 0.8017708 1.9002480 0.42192958
## ENSG00000243485 ENSG00000243485 0.4714222 -0.1774138 2.1746931 -0.08158106
## ENSG00000238009 ENSG00000238009 0.4129118 -0.8760893 2.4293887 -0.36062131
## ENSG00000241860 ENSG00000241860 3.4335950 -0.9342857 0.8377007 -1.11529783
## ENSG00000279928 ENSG00000279928 0.7873172 0.2396154 1.6939425 0.14145428
## pvalue padj gene_id GeneSymbol Chromosome
## ENSG00000227232 0.1380857 0.2730521 ENSG00000227232 WASH7P 1:14404-29570
## ENSG00000278267 0.6730764 NA ENSG00000278267 MIR6859-1 1:17369-17436
## ENSG00000243485 0.9349799 NA ENSG00000243485 MIR1302-2HG 1:29554-31109
## ENSG00000238009 0.7183826 NA ENSG00000238009 AL627309.1 1:89295-133723
## ENSG00000241860 0.2647228 0.4374984 ENSG00000241860 AL627309.5 1:141474-173862
## ENSG00000279928 0.8875111 NA ENSG00000279928 FO538757.1 1:182696-184174
## Class Strand
## ENSG00000227232 unprocessed_pseudogene -
## ENSG00000278267 miRNA -
## ENSG00000243485 lncRNA +
## ENSG00000238009 lncRNA -
## ENSG00000241860 lncRNA -
## ENSG00000279928 unprocessed_pseudogene +
The DESeqResult object can basically be handled like a data.frame
table(ControlvsKO$padj < 0.05)
##
## FALSE TRUE
## 16309 5402
table(ControlvsKO$padj < 0.05 & abs(ControlvsKO$log2FoldChange)>2)
##
## FALSE TRUE
## 30975 595
Combine the deseq2 results with pc values
merged_pcresults <- merge(Results_convsko, sorted_PC1_values, by="Geneid")
merged_pcresults_KOA <- merge(Results_KOA, sorted_PC1_values_KOA, by="Geneid")
merged_pcresults_KOB <- merge(Results_KOB, sorted_PC1_values_KOB, by="Geneid")
Sort results based on PC values
merged_pcresults_sorted <- merged_pcresults[order(-merged_pcresults$PC1),]
row.names(merged_pcresults_sorted) <- merged_pcresults_sorted$Geneid
merged_pcresults_sorted <- merged_pcresults_sorted[,-c(14:18)]
names(merged_pcresults_sorted)[9:12] <- c("GeneSymbol", "Chromosome", "Class","Strand")
merged_pcresults_sorted_KOA <- merged_pcresults_KOA[order(-merged_pcresults_KOA$PC1),]
row.names(merged_pcresults_sorted_KOA) <- merged_pcresults_sorted_KOA$Geneid
merged_pcresults_sorted_KOA <- merged_pcresults_sorted_KOA[,-c(14:18)]
names(merged_pcresults_sorted_KOA)[9:12] <- c("GeneSymbol", "Chromosome", "Class","Strand")
merged_pcresults_sorted_KOB <- merged_pcresults_KOB[order(-merged_pcresults_KOB$PC1),]
row.names(merged_pcresults_sorted_KOB) <- merged_pcresults_sorted_KOB$Geneid
merged_pcresults_sorted_KOB <- merged_pcresults_sorted_KOB[,-c(14:18)]
names(merged_pcresults_sorted_KOB)[9:12] <- c("GeneSymbol", "Chromosome", "Class","Strand")
Take the top 20 genes according to PCA analysis to obtain their LFC and padj values
merged_pcresults_sorted_top20 <- merged_pcresults_sorted[order(-merged_pcresults_sorted$PC1),][1:20,]
merged_pcresults_sorted_top20_KOA <- merged_pcresults_sorted_KOA[order(-merged_pcresults_sorted_KOA$PC1),][1:20,]
merged_pcresults_sorted_top20_KOB <- merged_pcresults_sorted_KOB[order(-merged_pcresults_sorted_KOB$PC1),][1:20,]
Take the top 500 genes according to PCA analysis to obtain their LFC and padj values
merged_pcresults_sorted_top500 <- merged_pcresults_sorted[order(-merged_pcresults_sorted$PC1),][1:500,]
merged_pcresults_sorted_top500_KOA <- merged_pcresults_sorted_KOA[order(-merged_pcresults_sorted_KOA$PC1),][1:500,]
merged_pcresults_sorted_top500_KOB <- merged_pcresults_sorted_KOB[order(-merged_pcresults_sorted_KOB$PC1),][1:500,]
How many genes have a padj < 0.05 Control vs KO
sum(merged_pcresults_sorted$padj < 0.05, na.rm=TRUE)
## [1] 5402
Control vs KOA
sum(merged_pcresults_sorted_KOA$padj < 0.05, na.rm=TRUE)
## [1] 6470
Control vs KOB
sum(merged_pcresults_sorted_KOB$padj < 0.05, na.rm=TRUE)
## [1] 6681
Filter results to those with P<0.05
merged_pcresults_sorted_sig <- subset(merged_pcresults_sorted, padj<0.05)
Filter results to those with a log2FC>2
merged_pcresults_sorted_sigFC2 <- subset(merged_pcresults_sorted_sig, abs(log2FoldChange)>=2)
Can do this in one command - P<0.05 & logFC>|2|
merged_pcresults_sorted_sigFC2 <- subset(merged_pcresults_sorted, padj<=0.05 & abs(log2FoldChange)>=2)
# Control vs KOA
merged_pcresults_sorted_KOA_sigFC2 <- subset(merged_pcresults_sorted_KOA, padj<=0.05 & abs(log2FoldChange)>=2)
# Control vs KOB
merged_pcresults_sorted_KOB_sigFC2 <- subset(merged_pcresults_sorted_KOB, padj<=0.05 & abs(log2FoldChange)>=2)
Order the results by the smallest pvalue
merged_pcresults_sorted_pvalue <- merged_pcresults_sorted[order(merged_pcresults_sorted$padj), ]
Filter the genes which pass a fold change of > 1.5 and padj < 0.05
merged_pcresults_sorted_sigFC15 <- subset(merged_pcresults_sorted_sig, padj<=0.05 & abs(log2FoldChange)>=1.5)
Filter for upregulated genes
upregulated_sig_DEGS <- subset(merged_pcresults_sorted_sigFC2, log2FoldChange>0)
upregulated_sig_DEGS_15 <- subset(merged_pcresults_sorted_sigFC15, log2FoldChange>0)
# Control vs KOA
upregulated_sig_DEGs_KOA <- subset(merged_pcresults_sorted_KOA_sigFC2, log2FoldChange>0)
# Control vs KOB
upregulated_sig_DEGs_KOB <- subset(merged_pcresults_sorted_KOB_sigFC2, log2FoldChange>0)
Filter for downregulated genes
downregulated_sig_DEGS <- subset(merged_pcresults_sorted_sigFC2, log2FoldChange < 0)
downregulated_sig_DEGS_15 <- subset(merged_pcresults_sorted_sigFC15, log2FoldChange < 0)
# Control vs KOA
downregulated_sig_DEGs_KOA <- subset(merged_pcresults_sorted_KOA_sigFC2, log2FoldChange<0)
# Control vs KOB
downregulated_sig_DEGs_KOB <- subset(merged_pcresults_sorted_KOB_sigFC2, log2FoldChange<0)
Obtain gene list
DEG_geneids_FC2 <- row.names(merged_pcresults_sorted_sigFC2)
DEG_genenames_FC2 <- data.frame(merged_pcresults_sorted_sigFC2$GeneSymbol)
names(DEG_genenames_FC2) <- c("GeneSymbol")
head(DEG_genenames_FC2, n=3)
## GeneSymbol
## 1 MAGEA1
## 2 MAGEA3
## 3 MAGEA6
MA Plots The MA plot provides a global view of the relationship between the expression change between conditions (log ratios, M), the average expression strength of the genes (average mean, A) and the ability of the algorithm to detect differential gene expression: genes that pass the significance threshold (adjusted p-value<0.05) are colored in red
Points will be colored red if the adjusted p value is less than 0.05
# Add two lines where logFC > 2 and logFC < -2
drawLines <- function() abline(h=c(-2,2),col="dodgerblue",lwd=2)
par(mar=c(4,4,4,4), mfrow=c(1,1), cex=1.0, cex.main=0.5, cex.axis=0.8)
plot.new()
# MA plot of significant DEGs
plotMA(ControlvsKO, alpha=0.05, main="WT vs. KO")
# It is more useful visualize the MA-plot for the shrunken log2 fold changes, which remove the noise associated with log2 fold changes from low count genes without requiring arbitrary filtering thresholds.
plotMA(resLFC, alpha=0.05, main="WT vs. KO Log Fold Shrinkage")
## Changing the thresholds to what you want to see
## plotting all those which pass p<0.05 with a folchange >2 or <2 or between -2->2
## lfcThreshold of 1, means log2 fold change is 2
par(mfrow=c(2,2),mar=c(2,2,1,1))
ylim <- c(-10,10)
# Two tailed test where log2 FC is > or < lfc threshold
resGA <- results(dds, lfcThreshold=log2(2), alpha=0.05, altHypothesis="greaterAbs")
resGA <- results(dds, lfcThreshold=1, alpha=0.05, altHypothesis="greaterAbs")
# p values are the maximum of the upper and lower tests
resLA <- results(dds, lfcThreshold=log2(2), alpha=0.05, altHypothesis="lessAbs")
# log2 FC greater than threshold and P<0.05
resG <- results(dds, lfcThreshold=log2(2), alpha=0.05, altHypothesis="greater")
# log 2 FC smaller than threshold and P<0.05
resL <- results(dds, lfcThreshold=log2(2), alpha=0.05, altHypothesis="less")
drawLines <- function() abline(h=c(-1,1),col="dodgerblue",lwd=2)
plotMA(resGA, ylim=ylim, alpha=0.05); drawLines()
plotMA(resLA, ylim=ylim, alpha=0.05); drawLines()
plotMA(resG, ylim=ylim, alpha=0.05); drawLines()
plotMA(resL, ylim=ylim, alpha=0.05); drawLines()
# Test genes which have fold change more than doubling or less than halving
# lfcThreshold of 1, means log2 fold change is 2 - doubling
#res.thr <- results(dds, lfcThreshold=1, alpha=0.05)
#plotMA(res.thr, ylim=c(-10,10))
Histogram of p-values
par(mfrow=c(1,1), cex=1.5)
hist(ControlvsKO$padj,
col= "grey", border = "white", xlab = "P-adjusted value", ylab = "Frequency",
main = "Frequencies of p-values for Control vs KO")
Bar Plots of Gene expression Plot the gene with the minimum adjusted p-value
plotCounts(dds, gene=which.min(ControlvsKO$padj), intgroup="condition", normalized=TRUE)
More sophisticated plot
library(ggplot2)
data <- plotCounts(dds, gene=which.min(ControlvsKO$padj), intgroup=c("genotype","condition","rep", "sample"), returnData=TRUE)
ggplot(data, aes(x=genotype, y=count, shape=rep, colour=condition)) +
scale_shape_manual(name="Replicate",
labels=c("1","2","3","4"),
values = rep(c(15,16,17,18))) +
scale_colour_manual(name="Condition",
labels=c("Control", "KO"),
values = rep(c("#B2DFEE","#FFA500"))) +
geom_point(position=position_jitter(width=.1,height=0)) +
ggtitle("Expression of RPL22L1") +
scale_y_log10()
Read counts of single genes DESeq2 offers a wrapper function to plot read counts for single genes
library (grDevices ) # for italicizing the gene name
# EPO: ENSG00000130427
data <- plotCounts(dds, gene="ENSG00000130427", intgroup=c("genotype","condition","rep", "sample"), returnData=TRUE)
ggplot(data, aes(x=genotype, y=count, col=sample_info$condition, shape=rep)) +
geom_point(position=position_jitter(width=.1,height=0)) +
ggtitle("Expression of EPO") +
scale_y_log10()
Plot the expression of the top 50 genes on a bar plot
# load in packages
library(tibble)
## Warning: package 'tibble' was built under R version 3.6.2
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:S4Vectors':
##
## expand
## The following object is masked from 'package:magrittr':
##
## extract
library(ggplot2)
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ✓ purrr 0.3.3
## ── Conflicts ─────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x plyr::arrange() masks dplyr::arrange()
## x dplyr::collapse() masks IRanges::collapse()
## x dplyr::combine() masks Biobase::combine(), BiocGenerics::combine()
## x purrr::compact() masks plyr::compact()
## x plyr::count() masks dplyr::count(), matrixStats::count()
## x plyr::desc() masks dplyr::desc(), IRanges::desc()
## x tidyr::expand() masks S4Vectors::expand()
## x tidyr::extract() masks magrittr::extract()
## x plyr::failwith() masks dplyr::failwith()
## x dplyr::filter() masks stats::filter()
## x dplyr::first() masks S4Vectors::first()
## x plyr::id() masks dplyr::id()
## x dplyr::lag() masks stats::lag()
## x plyr::mutate() masks dplyr::mutate()
## x ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## x purrr::reduce() masks GenomicRanges::reduce(), IRanges::reduce()
## x plyr::rename() masks dplyr::rename(), S4Vectors::rename()
## x purrr::set_names() masks magrittr::set_names()
## x purrr::simplify() masks DelayedArray::simplify()
## x dplyr::slice() masks IRanges::slice()
## x plyr::summarise() masks dplyr::summarise()
## x plyr::summarize() masks dplyr::summarize()
# create tibble
merged_pcresults_sorted_tb <- merged_pcresults_sorted %>%
data.frame() %>%
rownames_to_column(var="gene") %>%
as_tibble()
# subset to significant DEGs
DEGs_ControlvsKO_tb <- subset(merged_pcresults_sorted_tb, abs(log2FoldChange) >= 2 & padj<0.05)
# pull out the top 50 DEGS based on pvalue
top50_DEGs_genes <- DEGs_ControlvsKO_tb %>%
arrange(-PC1) %>% #Arrange rows by padj values (can change this if you want to any column)
pull(gene) %>% #Extract character vector of ordered genes
head(n=50) # change this to number you want to pull out
top50_DEG_genes <- merged_pcresults_sorted[top50_DEGs_genes,] # pull out from merged pcresults
top50_DEG_genes <- data.frame(top50_DEG_genes) #make into a data frame
top50_DEG_genes$Geneid <- row.names(top50_DEG_genes) # change row names to Gene ID
top50_DEG_genes <- merge(top50_DEG_genes, gtf, by="Geneid") # merge with gtf file so can obtain gene names
#change the normalised counts into a tibble
normalized_counts_tb <- norm %>%
data.frame() %>%
rownames_to_column(var="gene") %>%
as_tibble()
# pull out the top 50 genes from the normalised counts tibble
top50_DEG_norm <- normalized_counts_tb %>%
filter(gene %in% top50_DEGs_genes)
gathered_top50_DEGs <- top50_DEG_norm %>%
tidyr::gather(colnames(top50_DEG_norm)[2:13], key = "sample", value = "normalized_counts")
gathered_top50_DEGs <- inner_join(gathered_top50_DEGs,sample_info, by="sample")
## Warning: Column `sample` joining character vector and factor, coercing into
## character vector
names(gathered_top50_DEGs)[1] <- "Geneid"
gathered_top50_DEGs <- inner_join(gtf, gathered_top50_DEGs, by="Geneid")
## Warning: Column `Geneid` joining factor and character vector, coercing into
## character vector
gathered_top50_DEGs <- inner_join(merged_pcresults_sorted, gathered_top50_DEGs, by="Geneid")
## Warning: Column `Geneid` joining factor and character vector, coercing into
## character vector
ggplot(gathered_top50_DEGs) +
geom_point(aes(x = GeneSymbol.x, y = normalized_counts, color = condition)) +
xlab("Genes") +
ylab("Regularised Log Counts") +
ggtitle("Top 50 Significantly Differentially Expressed Genes (P<0.05 & abs(Log2FC) > 2)") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size=6)) +
theme(plot.title = element_text(hjust = 0.5, size=10))
Heat Map Plotting Heat maps of the count matrix
library(pheatmap)
select <- order(rowMeans(counts(dds,normalized=TRUE)),
decreasing=TRUE)[1:50]
df <- as.data.frame(colData(dds)[,c("genotype", "condition", "rep")])
#Heat map of the Count matrix
pheatmap(data.matrix(norm[select,]), cluster_rows=F, show_rownames=TRUE,
cluster_cols=FALSE, annotation_col=df, main="Normalised Counts", cex=0.8)
# on the vsd data
pheatmap(assay(vsd)[select,], cluster_rows=T, show_rownames=F,
cluster_cols=FALSE, annotation_col=df, main="VSD Transformation")
# onthe rlog data
pheatmap(assay(rld)[select,], cluster_rows=T, show_rownames=FALSE,
cluster_cols=F, annotation_col=df, main="rLog Transformation")
Heat map of the top most expressed genes based on comparing individual transcripts across all samples Select the most highly expressed genes from the study and perform clustering
library("RColorBrewer")
library("gplots")
par(cex=1.0)
select <- order(rowMeans(norm), decreasing=TRUE)[1:50]
hmcol <- colorRampPalette(brewer.pal(9, "BuGn"))(50)
y <- data.matrix(norm[select,])
# now add gene names rather than symbols
ya <- data.frame(y)
ya$Geneid <- row.names(ya)
ya <- join(ya,gtf, by="Geneid")
row.names(ya) <- ya$GeneSymbol
ya <- ya[,-c(13:18)]
ya <- data.matrix(ya)
heatmap.2(ya, main="All samples", cexRow=0.8, cexCol=1.2, offsetCol=1,labCol=c("Control", NA, NA,NA," KOA", NA,NA,NA, " KOB",NA,NA,NA), col=hmcol, Rowv=TRUE, Colv=F, scale="none", ColSideColors=c("#B2DFEE","#B2DFEE", "#B2DFEE", "#B2DFEE", "#FFA500","#FFA500","#FFA500","#FFA500","#FF1493","#FF1493","#FF1493","#FF1493"), srtCol=0, adjCol=c(-2.2,-68),dendrogram="none", trace="none", key.title="Counts density", key.xlab="Counts", key.ylab="")
par(xpd=T)
text(0.05,0.65, "Increasing\nexpression\n(top 50)")
arrows(0.05, 0.6, 0.05, 0.4, lwd=3, xpd=T)
Plot the top 20 DEGs sorted by PC1
topgenes <- head(rownames(merged_pcresults_sorted_sigFC2),20)
topgenes
## [1] "ENSG00000198681" "ENSG00000221867" "ENSG00000197172" "ENSG00000136167"
## [5] "ENSG00000170627" "ENSG00000130844" "ENSG00000011677" "ENSG00000143320"
## [9] "ENSG00000133169" "ENSG00000101160" "ENSG00000251381" "ENSG00000249568"
## [13] "ENSG00000076242" "ENSG00000198185" "ENSG00000188511" "ENSG00000168672"
## [17] "ENSG00000224817" "ENSG00000204389" "ENSG00000181007" "ENSG00000101986"
mat <- rldMatrix[topgenes,]
mat <- mat - rowMeans(mat)
mat2 <- data.frame(mat)
mat2$Geneid <- row.names(mat2)
mat2 <- join(mat2, gtf,by="Geneid")
row.names(mat2) <- mat2$GeneSymbol
mat2 <- mat2[,-c(13:18)]
mat2 <- data.matrix(mat2)
hmcol <- colorRampPalette(brewer.pal(11, "RdBu"))(50)
df <- as.data.frame(colData(dds)[,c("genotype", "condition")])
heatmap.2(mat2, main="Top 20 DEGs", cexRow=0.8, cexCol=0.8, col=hmcol, Rowv=T, Colv=F, labCol=c("Control", NA,NA,NA," KOB",NA,NA,NA), adjCol=c(-1.6,-62),srtCol=0, scale="none", dendrogram="none", trace="none", margin=c(1, 5), key.title="Counts density", key.xlab="Counts", key.ylab="")
Plot the top 50 DEGs sorted by PC1
topgenes <- head(rownames(merged_pcresults_sorted_sigFC2),50)
mat <- rldMatrix[topgenes,]
mat <- mat - rowMeans(mat)
mat2 <- data.frame(mat)
mat2$Geneid <- row.names(mat2)
mat2 <- join(mat2, gtf,by="Geneid")
row.names(mat2) <- mat2$GeneSymbol
mat2 <- mat2[,-c(13:18)]
mat2 <- data.matrix(mat2)
hmcol <- colorRampPalette(brewer.pal(11, "RdBu"))(50)
df <- as.data.frame(colData(dds)[,c("genotype", "condition")])
library(gplots)
heatmap.2(mat2, main="Top 50 DEGs", cexRow=0.6, cexCol=0.8, col=hmcol, Rowv=T, Colv=F, labCol=c("Control", NA,NA,NA," KOB",NA,NA,NA), adjCol=c(-1.6,-62),srtCol=0, scale="none", dendrogram="none", trace="none", margin=c(1, 5), key.title="Counts density", key.xlab="Counts", key.ylab="")
#pheatmap(mat2, annotation_col=df, fontsize_row = 5, fontsize_col = 7, cluster_cols = F,cluster_rows = T, main="Top 50 DEGs \n P<0.05 & abs|Log2 Fold Change|>2")
Scaled to z-scores
# scale = "row" converts to z-score scaling within rows
plot.new()
heatmap.2(mat2, main="Top 50 DEGs", cexRow=0.6, cexCol=0.8, col=hmcol, Rowv=T, Colv=F, labCol=c("Control", NA,NA,NA," KOA",NA,NA,NA," KOB",NA,NA,NA), adjCol=c(-1.6,-62),srtCol=0, scale="row", dendrogram="none", trace="none", margin=c(1, 5), key.title="Counts density", key.xlab="Counts", key.ylab="")
Alternative heatmap
pheatmap(mat2, annotation_col=df, fontsize_row = 5, fontsize_col = 7, cluster_cols = F,cluster_rows = T, main="Top 50 DEGs \n P<0.05 & abs|Log2 Fold Change|>2")
Plotting the top 500 genes by PC1
topgenes <- head(rownames(merged_pcresults_sorted),500)
mat <- rldMatrix[topgenes,]
mat <- mat - rowMeans(mat)
mat2 <- data.frame(mat)
mat2$Geneid <- row.names(mat2)
mat2 <- join(mat2, gtf,by="Geneid")
row.names(mat2) <- mat2$GeneSymbol
mat2 <- mat2[,-c(13:18)]
mat2 <- data.matrix(mat2)
hmcol <- colorRampPalette(brewer.pal(11, "RdBu"))(50)
df <- as.data.frame(colData(dds)[,c("genotype", "condition")])
# scale = "row" converts to z-score scaling within rows
heatmap.2(mat2, main="Top 50 DEGs", cexRow=0.6, cexCol=0.8, col=hmcol, Rowv=T, Colv=F, labCol=c("Control", NA,NA,NA," KOB",NA,NA,NA), adjCol=c(-1.6,-62),srtCol=0, scale="row", dendrogram="none", trace="none", margin=c(1, 5), key.title="Counts density", key.xlab="Counts", key.ylab="")
Plotting all the significant DEGs (p<0.05 & abs|log2FC|>2)
# load the library with the aheatmap () function
library(NMF)
## Loading required package: pkgmaker
## Loading required package: registry
##
## Attaching package: 'pkgmaker'
## The following object is masked from 'package:S4Vectors':
##
## new2
## Loading required package: rngtools
## Loading required package: cluster
## NMF - BioConductor layer [OK] | Shared memory capabilities [NO: bigmemory] | Cores 3/4
## To enable shared memory capabilities, try: install.extras('
## NMF
## ')
##
## Attaching package: 'NMF'
## The following object is masked from 'package:plotrix':
##
## dispersion
## The following object is masked from 'package:DelayedArray':
##
## seed
## The following object is masked from 'package:BiocParallel':
##
## register
## The following object is masked from 'package:S4Vectors':
##
## nrun
# aheatmap needs a matrix of values , e.g., a matrix of DE genes with the transformed read counts for each replicate
# identify gene names with desired cut off and fold change cut off
DEG_genenames_FC2 <- rownames(merged_pcresults_sorted_sigFC2)
head(DEG_genenames_FC2, n=3)
## [1] "ENSG00000198681" "ENSG00000221867" "ENSG00000197172"
DEGs_up <- rownames(merged_pcresults_sorted_sigFC2[merged_pcresults_sorted_sigFC2$log2FoldChange>0,])
DEGs_down <- rownames(merged_pcresults_sorted_sigFC2[merged_pcresults_sorted_sigFC2$log2FoldChange<0,])
# extract the normalized read counts for DE genes into a matrix
hm.mat_DGEgenes <- rldMatrix[DEG_genenames_FC2, ]
hm.mat_DGEgenes_up <- log.norm.counts[DEGs_up, ]
hm.mat_DGEgenes_down <- log.norm.counts[DEGs_down, ]
# plot the normalized read counts of DE genes sorted by the adjusted p- value
aheatmap(hm.mat_DGEgenes, Rowv = NA , Colv = NA)
# combine the heatmap with hierarchical clustering
# aheatmap(hm.mat_DGEgenes,
# Rowv = TRUE, Colv=TRUE, # add dendrograms to rows and columns
#distfun = "euclidean", hclustfun = "average", annRow=NA, labRow=NA)
# scale the read counts per gene to emphasize the sample-type - specific differences
# the read count values are scaled per row so that the colors actually represent z-scores rather than the underlying read counts.
annotation = data.frame(condition=sample_info$condition)
# aheatmap(hm.mat_DGEgenes,
# Rowv = TRUE , Colv = TRUE ,
# distfun = "pearson", hclust = "ward",
# scale = "row",annCol = annotation, labRow=NA, annRow=NA, main="Heat Map \n DEG Genes with P<0.05 & LogFC > |2|") # values are transformed into distances from the center of the row - specific average : ( actual value - mean of the group ) / standard deviation
Complex heat map
sigGeneList <- row.names(data.frame(subset(ControlvsKO, abs(log2FoldChange)>=2 & padj<=0.05)))
myCol <- colorRampPalette(c("green", "black", "red"))(10)
myBreaks <- seq(-3, 3, length.out=10)
heat <- t(rldMatrix)[,sigGeneList]
heat <- t(scale(t(heat)))
sampleOrder <- c(
c("WT1","WT2","WT3","WT4"),
c("KOA1","KOA2","KOA3","KOA4"), c("KOB1", "KOB2", "KOB3", "KOB4"))
library(ComplexHeatmap)
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.0.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
##
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional
## genomic data. Bioinformatics 2016.
## ========================================
library(circlize)
library(cluster)
ann <- data.frame(condition=sample_info$condition)
colnames(ann) <- c("Condition")
colours <- list("Control"="lightblue", "KOA"="purple", "KOB"="pink")
rowAnn <- rowAnnotation(df=ann, boxplot=row_anno_boxplot(heat, border=FALSE, show_annotation_name=FALSE, gp=gpar(fill="#CCCCCC", fontsize=2), lim=c(-4,4), pch=".", size=unit(2, "mm"), col=sample_info$condition, annotation_width=unit(c(1, 7.5), "cm")))
hmap <- Heatmap(heat,
#split=sampleOrderSplit,
row_order=sampleOrder,
name="Gene expression\nZ-score",
col=colorRamp2(myBreaks, myCol),
heatmap_legend_param=list(color_bar="continuous", legend_direction="vertical", legend_width=unit(4,"cm"), title_position="topcenter", title_gp=gpar(fontsize=6, fontface="bold")),
cluster_rows=FALSE,
show_row_dend=FALSE,
row_title="",
row_title_side="left",
row_title_gp=gpar(fontsize=8, fontface="bold"),
row_title_rot=0,
show_row_names=TRUE,
row_names_gp=gpar(fontsize=8, fontface="bold"),
row_names_side="left",
row_dend_width=unit(30,"mm"),
cluster_columns=TRUE,
show_column_dend=TRUE,
column_title="Transcripts",
column_title_side="bottom",
column_title_gp=gpar(fontsize=12, fontface="bold"),
column_title_rot=0,
show_column_names=FALSE,
#column_names_gp=gpar(fontsize=termLab, fontface="bold"),
#column_names_max_height=unit(15, "cm"),
column_dend_height=unit(50,"mm"),
clustering_distance_columns=function(x) as.dist(1-cor(t(x))),
clustering_method_columns="ward.D2",
clustering_distance_rows=function(x) as.dist(1-cor(t(x))),
clustering_method_rows="ward.D2")
#top_annotation_height=unit(1.75,"cm"),
#bottom_annotation=sampleBoxplot)
#bottom_annotation_height=unit(4, "cm"))
draw(hmap + rowAnn, heatmap_legend_side="left", annotation_legend_side="left")
Heat map of all significant genes
# set the thresholds
padj.cutoff <- 0.05
lfc.cutoff <- 2
sigOE <- merged_pcresults_sorted_tb %>%
filter(padj < padj.cutoff & abs(log2FoldChange) > lfc.cutoff)
norm_OEsig <- normalized_counts_tb[,c(1,2:13)] %>%
filter(gene %in% sigOE$gene) %>%
data.frame() %>%
column_to_rownames(var = "gene")
annotation <- sample_info %>%
select(sample, condition) %>%
data.frame(row.names = "sample")
heat_colors <- brewer.pal(6, "YlOrRd")
pheatmap(norm_OEsig,
color = heat_colors,
cluster_rows = T,
show_rownames = F,
annotation = annotation,
border_color = NA,
fontsize = 10,
scale = "row",
fontsize_row = 10,
height = 20)
Volcano Plot
main <- "KO vs WT Volcano Plot"
merged_pcresults_sorted$genenames <- merged_pcresults_sorted$GeneSymbol
toptable <- data.frame(merged_pcresults_sorted)
# check how many NA there are under padj
sum(is.na(merged_pcresults_sorted$padj))
## [1] 10674
source("~/Documents/EPO Project/CRISPR/Whole gene knock-out/Confirming EPO KO/RNA Sequencing/DeSeq2/VolcanoPlot.R")
volcano <- VolcanoPlot(toptable, 0.05, 0.05, 0.05, 2, "Volcano Plot Control vs KO")
volcano
## Warning: Removed 10674 rows containing missing values (geom_point).
# Control vs KOA
main <- "KOA vs WT Volcano Plot"
merged_pcresults_sorted_KOA$genenames <- merged_pcresults_sorted_KOA$GeneSymbol
toptable <- data.frame(merged_pcresults_sorted_KOA)
# check how many NA there are under padj
sum(is.na(merged_pcresults_sorted_KOA$padj))
## [1] 8785
source("~/Documents/EPO Project/CRISPR/Whole gene knock-out/Confirming EPO KO/RNA Sequencing/DeSeq2/VolcanoPlot.R")
volcano <- VolcanoPlot(toptable, 0.05, 0.05, 0.05, 2, "Volcano Plot Control vs KOA")
volcano
## Warning: Removed 8785 rows containing missing values (geom_point).
# Control vs KOB
main <- "KOB vs WT Volcano Plot"
merged_pcresults_sorted_KOB$genenames <- merged_pcresults_sorted_KOB$GeneSymbol
toptable <- data.frame(merged_pcresults_sorted_KOB)
# check how many NA there are under padj
sum(is.na(merged_pcresults_sorted_KOB$padj))
## [1] 10088
source("~/Documents/EPO Project/CRISPR/Whole gene knock-out/Confirming EPO KO/RNA Sequencing/DeSeq2/VolcanoPlot.R")
volcano <- VolcanoPlot(toptable, 0.05, 0.05, 0.05, 2, "Volcano Plot Control vs KOB")
volcano
## Warning: Removed 10088 rows containing missing values (geom_point).
As we have two KO cell lines, we want to refine our list of overlapping DEGs and identify those which as significantly differentially expressed between Control and both KOA and KOB.
First we need to merge the results of Control vs KOA and Control vs KOB together
merge_KOA_KOB <- merge(merged_pcresults_sorted_KOA,merged_pcresults_sorted_KOB,by="Geneid", suffix=c(".WTvsKOA", ".WTvsKOB"), all=T)
Calculate the mean PC by average PC1 for WTvsKOA and PC1 for WTvsKOB
merge_KOA_KOB$meanPC <- (merge_KOA_KOB$PC1.WTvsKOA + merge_KOA_KOB$PC1.WTvsKOB)/2
#sort by mean PC value
merge_KOA_KOB_pcsorted <- merge_KOA_KOB[order(-merge_KOA_KOB$meanPC), ]
Merge with the overall results when combining all KOs
merge_KOA_KOB_pcsorted_all <- merge(merge_KOA_KOB_pcsorted, merged_pcresults_sorted, by="Geneid")
# Sort by mean PC
merge_KOA_KOB_pcsorted_all <- merge_KOA_KOB_pcsorted_all[order(-merge_KOA_KOB_pcsorted_all$meanPC),]
# write.table(merge_KOA_KOB1_pcsorted_1, "../Overlapping DEG analysis/merged_KOA_KOB_analysis_PC_sorted.txt", sep="/t", row.names=F, quote=F)
Take top 500 genes based on average PC value
top500_merge_KOA_KOB_pcsorted_all <- merge_KOA_KOB_pcsorted_all[order(-merge_KOA_KOB_pcsorted_all$meanPC),][1:500,]
# write.table(top500_merge_KOA_KOB1_pcsorted_1, "../Overlapping DEG analysis/Top_500_DEGs_basedon_meanPC1_KOA_KOB_analysis.txt", sep="\t", row.names=F, quote=F)
Heatmap of the top 500 genes
row.names(top500_merge_KOA_KOB_pcsorted_all) <- top500_merge_KOA_KOB_pcsorted_all$Geneid
topgenes <- head(rownames(top500_merge_KOA_KOB_pcsorted_all),500)
mat <- rldMatrix[topgenes,]
mat <- mat - rowMeans(mat)
mat2 <- data.frame(mat)
mat2$Geneid <- row.names(mat2)
mat2 <- join(mat2, gtf,by="Geneid")
row.names(mat2) <- mat2$GeneSymbol
mat2 <- mat2[,-c(13:18)]
mat2 <- data.matrix(mat2)
hmcol <- colorRampPalette(brewer.pal(11, "RdBu"))(50)
df <- as.data.frame(colData(dds)[,c("genotype", "condition")])
# scale = "row" converts to z-score scaling within rows
heatmap.2(mat2, main="Top 50 DEGs", cexRow=0.6, cexCol=0.8, col=hmcol, Rowv=T, Colv=F, labCol=c("Control", NA,NA,NA," KOA",NA,NA,NA," KOB",NA,NA,NA), adjCol=c(-0.8,-62),srtCol=0, scale="row", dendrogram="none", trace="none", margin=c(1, 5), key.title="Counts density", key.xlab="Counts", key.ylab="")
pheatmap(mat2, annotation_col=df,fontsize_col = 7, clustering_distance_rows = "euclidean", clustering_method="ward.D2", annotation_row=NA, show_rownames=F, cluster_cols = F,cluster_rows = T, main="Top 500 for segregating Control from KO along PC1 in PCA")
Venn Diagram of WT vs KOA & WT vs KOB
library(VennDiagram)
## Loading required package: futile.logger
##
## Attaching package: 'VennDiagram'
## The following object is masked from 'package:dendextend':
##
## rotate
y <- list()
y$KOA <- as.character(row.names(merged_pcresults_sorted_KOA))
y$KOB <- as.character(row.names(merged_pcresults_sorted_KOB))
# Generate plot
myCol <- brewer.pal(3, "RdBu")
# Pink: #FF9CEE, purple/blue: #AFCBFF, pastel yellow: #FFF5BA, Light blue: #ACE7FF
w <- venn.diagram(y,
#circles
lwd = 2,
lty = 'blank',
col="transparent",
fill = c(alpha("#FFF5BA",0.5), alpha('#ACE7FF',0.5)), cex=2,
# font inside circles
fontface = "plain",
fontfamily = "sans",
# category names
cat.cex=2, cat.fontface="bold", cat.default.pos="outer",
cat.pos=c(-10,10), cat.dist=c(0.055,0.055), cat.fontfamily="sans",
filename=NULL,alpha=0.7, scaled=TRUE)
grid.newpage()
grid.draw(w)
x <- list()
x$WTvsKOA <- as.character(row.names(merged_pcresults_sorted_KOA_sigFC2))
x$WTvsKOB <- as.character(row.names(merged_pcresults_sorted_KOB_sigFC2))
# Generate plot
# Pink: #FF9CEE, purple/blue: #AFCBFF, pastel yellow: #FFF5BA, Light blue: #ACE7FF
v <- venn.diagram(x,
#circles
lwd = 2,
lty = 'blank',
col="transparent",
fill = c(alpha("#FFF5BA",0.5), alpha('#ACE7FF',0.5)), cex=2,
# font inside circles
fontface = "plain",
fontfamily = "sans",
# category names
cat.cex=2, cat.fontface="bold", cat.default.pos="outer",
cat.pos=c(-10,10), cat.dist=c(0.055,0.055), cat.fontfamily="sans",
filename=NULL,alpha=0.7, scaled=TRUE)
# have a look at the default plot
grid.newpage()
grid.draw(v)
We now know that 314 genes are overlapping and significant in both KOs.
# have a look at the names in the plot object v
# We are interested in the labels
lapply(v, function(i) i$label)
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## [1] "379"
##
## [[6]]
## [1] "350"
##
## [[7]]
## [1] "314"
##
## [[8]]
## [1] "WTvsKOA"
##
## [[9]]
## [1] "WTvsKOB"
# Over-write labels (5 to 7 chosen by manual check of labels)
# in KOA only
v[[5]]$label <- paste(setdiff(x$WTvsKOA, x$WTvsKOB), collapse="\n")
only_in_KOA <- data.frame(paste(setdiff(x$WTvsKOA, x$WTvsKOB)))
# in KOB only
v[[6]]$label <- paste(setdiff(x$WTvsKOB, x$WTvsKOA) , collapse="\n")
only_in_KOB <- data.frame(paste(setdiff(x$WTvsKOB, x$WTvsKOA)))
# intersection i.e the genes that are signifcant in both analyses
v[[7]]$label <- paste(intersect(x$WTvsKOA, x$WTvsKOB), collapse="\n")
KOA_KOB <- data.frame(paste(intersect(x$WTvsKOA, x$WTvsKOB)))
## Obtain gene names and their values for each list
# overlapping list
KOA_KOB$Geneid <- KOA_KOB$paste.intersect.x.WTvsKOA..x.WTvsKOB..
KOA_KOB_values <- merge(KOA_KOB, merge_KOA_KOB, by="Geneid")
sum(abs(KOA_KOB_values$padj.WTvsKOB)>0.05)
## [1] 0
# Only significant DEG in KOA
only_in_KOA$Geneid <- only_in_KOA$paste.setdiff.x.WTvsKOA..x.WTvsKOB..
only_in_KOA_values <- merge(only_in_KOA, merge_KOA_KOB, by="Geneid")
# Only significant DEG in KOB
only_in_KOB$Geneid <- only_in_KOB$paste.setdiff.x.WTvsKOB..x.WTvsKOA..
only_in_KOB_values <- merge(only_in_KOB, merge_KOA_KOB, by="Geneid")
# Create a column of mean PC values
KOA_KOB_values$meanPC <- (KOA_KOB_values$PC1.WTvsKOA + KOA_KOB_values$PC1.WTvsKOB)/2
KOA_KOB_values_combined <- merge(KOA_KOB_values, merged_pcresults, by="Geneid")
sum(KOA_KOB_values_combined$padj>0.05, na.rm=T)
## [1] 6
KOA_KOB_values_combined_sortedPC <- KOA_KOB_values_combined[order(-KOA_KOB_values_combined$meanPC), ]
#### Upregulated genes
upreg_KOA_KOB_overlap <- KOA_KOB_values_combined_sortedPC[KOA_KOB_values_combined_sortedPC$log2FoldChange>0,]
#write.table(upreg_KOA_KOB_overlap, file="../Overlapping_genes_KOA_KOB_analysis_Upregulated.txt", sep="\t", row.names=F, quote=F)
#### Downregulated genes
downreg_KOA_KOB_overlap <- KOA_KOB_values_combined_sortedPC[KOA_KOB_values_combined_sortedPC$log2FoldChange<0,]
#write.table(downreg_KOA_KOB_overlap, file="../Overlapping_genes_KOA_KOB_analysis_Downregulated.txt", sep="\t", row.names=F, quote=F)
row.names(KOA_KOB_values_combined_sortedPC) <- KOA_KOB_values_combined_sortedPC$Geneid
overlapping_genes <- rownames(KOA_KOB_values_combined_sortedPC)
library(plyr)
library(RColorBrewer)
mat <- rldMatrix[overlapping_genes,]
mat <- mat - rowMeans(mat)
mat2 <- data.frame(mat)
mat2$Geneid <- row.names(mat2)
mat2 <- join(mat2, gtf,by="Geneid")
row.names(mat2) <- mat2$GeneSymbol
mat2 <- mat2[,-c(13:18)]
mat2 <- data.matrix(mat2)
hmcol <- colorRampPalette(brewer.pal(11, "RdBu"))(50)
df <- as.data.frame(colData(dds)[,c("genotype", "condition")])
# scale = "row" converts to z-score scaling within rows
library(gplots)
heatmap.2(mat2, main="Heat map of overlapping genes", cexRow=0.6, labRow=NA,cexCol=0.8, col=hmcol, Rowv=T, Colv=T, labCol=c("Control", NA,NA,NA," KOA",NA,NA,NA, " KOB",NA,NA,NA), adjCol=c(-2.5,50),srtCol=0, scale="row", dendrogram="column", trace="none", margin=c(1, 5), key.title="Counts density", key.xlab="Counts", key.ylab="")
row.names(KOA_KOB_values_combined_sortedPC) <- KOA_KOB_values_combined_sortedPC$Geneid
topgenes <- head(rownames(KOA_KOB_values_combined_sortedPC),50)
mat <- rldMatrix[topgenes,]
mat <- mat - rowMeans(mat)
mat2 <- data.frame(mat)
mat2$Geneid <- row.names(mat2)
mat2 <- join(mat2, gtf,by="Geneid")
row.names(mat2) <- mat2$GeneSymbol
mat2 <- mat2[,-c(13:18)]
mat2 <- data.matrix(mat2)
hmcol <- colorRampPalette(brewer.pal(11, "RdBu"))(50)
df <- as.data.frame(colData(dds)[,c("genotype", "condition")])
library(gplots)
heatmap.2(mat2, main="Top 50 DEGs", cexRow=0.6, cexCol=0.8, col=hmcol, Rowv=T, Colv=F, labCol=c("Control", NA,NA,NA," KOA",NA,NA,NA," KOB",NA,NA,NA), adjCol=c(-2,-85),srtCol=0, scale="row", dendrogram="none", trace="none", margin=c(1, 5), key.title="Counts density", key.xlab="Counts", key.ylab="")
# Alternative plotting method
# pheatmap(mat2, annotation_col=df, fontsize_row = 5, fontsize_col = 7, cluster_cols = F,cluster_rows = F, main="Top 50 DEGs \n P<0.05 & abs|Log2 Fold Change|>2")
library(tibble)
library(tidyr)
library(ggplot2)
library(tidyverse)
KOA_KOB_values_combined_sortedPC_tb <- KOA_KOB_values_combined_sortedPC %>%
data.frame() %>%
rownames_to_column(var="gene") %>%
as_tibble()
top50_DEGs_genes <- KOA_KOB_values_combined_sortedPC_tb %>%
arrange(-meanPC) %>% #Arrange rows by padj values
pull(gene) %>% #Extract character vector of ordered genes
head(n=50)
top50_DEG_genes <- merged_pcresults_sorted[top50_DEGs_genes,]
top50_DEG_genes <- data.frame(top50_DEG_genes)
top50_DEG_genes$Geneid <- row.names(top50_DEG_genes)
top50_DEG_genes <- merge(top50_DEG_genes, gtf, by="Geneid")
#change the normalised counts into a tibble
normalized_counts_tb <- norm %>%
data.frame() %>%
rownames_to_column(var="gene") %>%
as_tibble()
top50_DEG_norm <- normalized_counts_tb %>%
filter(gene %in% top50_DEGs_genes)
gathered_top50_DEGs <- top50_DEG_norm %>%
tidyr::gather(colnames(top50_DEG_norm)[2:13], key = "sample", value = "normalized_counts")
gathered_top50_DEGs <- inner_join(gathered_top50_DEGs, sample_info, by="sample")
## Warning: Column `sample` joining character vector and factor, coercing into
## character vector
names(gathered_top50_DEGs)[1] <- "Geneid"
gathered_top50_DEGs<- inner_join(gtf, gathered_top50_DEGs, by="Geneid")
## Warning: Column `Geneid` joining factor and character vector, coercing into
## character vector
gathered_top50_DEGs<- inner_join(merged_pcresults_sorted, gathered_top50_DEGs, by="Geneid")
## Warning: Column `Geneid` joining factor and character vector, coercing into
## character vector
#write.table(gathered_top20_DEGs, "Differential Gene Expression/Top_50_DEGs.csv", row.names=F, quote=F, sep=",", col.names=T)
#png("BarPlot_Top50_Overlapping_significant_genes.png", width=1200, height=800)
ggplot(gathered_top50_DEGs) +
geom_point(aes(x = GeneSymbol.x, y = normalized_counts, color = condition)) +
xlab("Genes") +
ylab("Regularised Log Counts") +
ggtitle("Top 50 Overlapping Significant Genes (P<0.05 & abs(Log2FC) > 2)") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size=6)) +
theme(plot.title = element_text(hjust = 0.5))
# dev.off()